diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/added_tokens.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/generation_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f7c977122387ec4d864bd9742cc093b6ee79c690 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2edf51bfde4fab8838b8baa8ca36505452019432a289721a0e8f6f947548090 +size 4921072616 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6139142358703a32e9af54da170b98f410d2b6dd --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c05b04110c22ec7814645abc2eb7fafd6dfbfd0f161bff0d466b7e6d0a15bca8 +size 4978830984 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05fe017845942a2fe3adc15573ef23a40244485b --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989fd57460aa4bee56c1ad1878bedd022983c53ece7b6851ea3de066975ba1ac +size 4100977896 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..f33de4b80f47e0bac1a414431a8354d8345d60c5 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.65332532291412, + -30.64622355117798, + -14.452480476760865, + -1.8581012797355654, + -2.2742317820549007, + -1.9569469915390014, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 3.0011677881240857, + 22.348905650329584, + 21.68580058555603, + 2.3937565994262693, + 4.117288079452516, + 3.295379007720948, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.570000648498535, + -1.0618462562561035, + 3.623035430908203, + 0.010442602448165417, + 0.7240540385246277, + 0.44398337602615356, + 0.12898989021778107, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.04909086227417, + 17.099597930908203, + 8.363018989562988, + 0.6997263431549072, + 1.1358375549316406, + 0.9687971472740173, + 0.9916459321975708, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.777750787353515, + -21.249025872802733, + -2.4021557040214536, + -4.092200187206268, + -3.2986312219619753, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.645499613952634, + 30.59561934127808, + 14.405443457031247, + 1.8499586300849913, + 2.268683268356323, + 1.963451420021057, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.6817545890808105, + 1.3444018363952637, + -3.5411791801452637, + -0.009792014956474304, + -0.7230188846588135, + -0.44849714636802673, + 0.15749873220920563, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.988739013671875, + 16.884004592895508, + 8.242538452148438, + 0.6991510391235352, + 1.1302146911621094, + 0.9690405130386353, + 0.9875192046165466, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6818e6d53f2c404a1f8a9c3f88907bf68d115a9b --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json @@ -0,0 +1,15434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3464716322908379, + "eval_steps": 500, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006120325601321991, + "grad_norm": 2.2432243824005127, + "learning_rate": 1.8e-07, + "loss": 0.1384, + "step": 10 + }, + { + "epoch": 0.0012240651202643981, + "grad_norm": 1.959119439125061, + "learning_rate": 3.8e-07, + "loss": 0.1388, + "step": 20 + }, + { + "epoch": 0.001836097680396597, + "grad_norm": 1.8843899965286255, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1307, + "step": 30 + }, + { + "epoch": 0.0024481302405287963, + "grad_norm": 1.7569042444229126, + "learning_rate": 7.8e-07, + "loss": 0.1238, + "step": 40 + }, + { + "epoch": 0.0030601628006609954, + "grad_norm": 2.6189017295837402, + "learning_rate": 9.800000000000001e-07, + "loss": 0.1275, + "step": 50 + }, + { + "epoch": 0.003672195360793194, + "grad_norm": 1.8418694734573364, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.1032, + "step": 60 + }, + { + "epoch": 0.004284227920925393, + "grad_norm": 1.481676697731018, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0816, + "step": 70 + }, + { + "epoch": 0.004896260481057593, + "grad_norm": 0.9590038061141968, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0732, + "step": 80 + }, + { + "epoch": 0.005508293041189791, + "grad_norm": 1.002897024154663, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0608, + "step": 90 + }, + { + "epoch": 0.006120325601321991, + "grad_norm": 0.9830108284950256, + "learning_rate": 1.98e-06, + "loss": 0.042, + "step": 100 + }, + { + "epoch": 0.006732358161454189, + "grad_norm": 0.858244001865387, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0314, + "step": 110 + }, + { + "epoch": 0.007344390721586388, + "grad_norm": 0.5761063694953918, + "learning_rate": 2.38e-06, + "loss": 0.029, + "step": 120 + }, + { + "epoch": 0.007956423281718587, + "grad_norm": 0.5434514284133911, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0227, + "step": 130 + }, + { + "epoch": 0.008568455841850786, + "grad_norm": 0.6488766670227051, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0202, + "step": 140 + }, + { + "epoch": 0.009180488401982986, + "grad_norm": 0.36763015389442444, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0157, + "step": 150 + }, + { + "epoch": 0.009792520962115185, + "grad_norm": 0.49271446466445923, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.010404553522247383, + "grad_norm": 0.23608209192752838, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0122, + "step": 170 + }, + { + "epoch": 0.011016586082379582, + "grad_norm": 0.47871828079223633, + "learning_rate": 3.58e-06, + "loss": 0.0131, + "step": 180 + }, + { + "epoch": 0.011628618642511782, + "grad_norm": 0.6862446069717407, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0131, + "step": 190 + }, + { + "epoch": 0.012240651202643981, + "grad_norm": 0.7964349389076233, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 0.01285268376277618, + "grad_norm": 0.5564846396446228, + "learning_rate": 4.18e-06, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 0.013464716322908379, + "grad_norm": 0.2810452878475189, + "learning_rate": 4.38e-06, + "loss": 0.0128, + "step": 220 + }, + { + "epoch": 0.014076748883040578, + "grad_norm": 0.4474979341030121, + "learning_rate": 4.58e-06, + "loss": 0.0188, + "step": 230 + }, + { + "epoch": 0.014688781443172776, + "grad_norm": 0.47965875267982483, + "learning_rate": 4.78e-06, + "loss": 0.0141, + "step": 240 + }, + { + "epoch": 0.015300814003304975, + "grad_norm": 0.3410812020301819, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0085, + "step": 250 + }, + { + "epoch": 0.015912846563437173, + "grad_norm": 0.39907002449035645, + "learning_rate": 5.18e-06, + "loss": 0.0106, + "step": 260 + }, + { + "epoch": 0.016524879123569373, + "grad_norm": 0.28909367322921753, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0103, + "step": 270 + }, + { + "epoch": 0.017136911683701572, + "grad_norm": 0.31524109840393066, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0101, + "step": 280 + }, + { + "epoch": 0.017748944243833772, + "grad_norm": 0.29430100321769714, + "learning_rate": 5.78e-06, + "loss": 0.0109, + "step": 290 + }, + { + "epoch": 0.01836097680396597, + "grad_norm": 0.2709169387817383, + "learning_rate": 5.98e-06, + "loss": 0.0102, + "step": 300 + }, + { + "epoch": 0.01897300936409817, + "grad_norm": 0.33067119121551514, + "learning_rate": 6.18e-06, + "loss": 0.0095, + "step": 310 + }, + { + "epoch": 0.01958504192423037, + "grad_norm": 0.28110620379447937, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0102, + "step": 320 + }, + { + "epoch": 0.02019707448436257, + "grad_norm": 0.27736902236938477, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0088, + "step": 330 + }, + { + "epoch": 0.020809107044494766, + "grad_norm": 0.3238557279109955, + "learning_rate": 6.780000000000001e-06, + "loss": 0.01, + "step": 340 + }, + { + "epoch": 0.021421139604626965, + "grad_norm": 0.30263441801071167, + "learning_rate": 6.98e-06, + "loss": 0.0095, + "step": 350 + }, + { + "epoch": 0.022033172164759165, + "grad_norm": 0.2618265450000763, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0096, + "step": 360 + }, + { + "epoch": 0.022645204724891364, + "grad_norm": 0.272565633058548, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0093, + "step": 370 + }, + { + "epoch": 0.023257237285023564, + "grad_norm": 0.44272440671920776, + "learning_rate": 7.58e-06, + "loss": 0.0087, + "step": 380 + }, + { + "epoch": 0.023869269845155763, + "grad_norm": 0.27631404995918274, + "learning_rate": 7.78e-06, + "loss": 0.0093, + "step": 390 + }, + { + "epoch": 0.024481302405287963, + "grad_norm": 0.4108494520187378, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0093, + "step": 400 + }, + { + "epoch": 0.02509333496542016, + "grad_norm": 0.43498387932777405, + "learning_rate": 8.18e-06, + "loss": 0.0098, + "step": 410 + }, + { + "epoch": 0.02570536752555236, + "grad_norm": 0.3419845700263977, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0091, + "step": 420 + }, + { + "epoch": 0.026317400085684558, + "grad_norm": 0.5677013993263245, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0104, + "step": 430 + }, + { + "epoch": 0.026929432645816757, + "grad_norm": 0.24424298107624054, + "learning_rate": 8.78e-06, + "loss": 0.0089, + "step": 440 + }, + { + "epoch": 0.027541465205948957, + "grad_norm": 0.267781138420105, + "learning_rate": 8.98e-06, + "loss": 0.0107, + "step": 450 + }, + { + "epoch": 0.028153497766081156, + "grad_norm": 0.38459253311157227, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0081, + "step": 460 + }, + { + "epoch": 0.028765530326213356, + "grad_norm": 0.2647954523563385, + "learning_rate": 9.38e-06, + "loss": 0.0082, + "step": 470 + }, + { + "epoch": 0.029377562886345552, + "grad_norm": 0.44312018156051636, + "learning_rate": 9.58e-06, + "loss": 0.0102, + "step": 480 + }, + { + "epoch": 0.02998959544647775, + "grad_norm": 0.2309781014919281, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0118, + "step": 490 + }, + { + "epoch": 0.03060162800660995, + "grad_norm": 0.41755014657974243, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0094, + "step": 500 + }, + { + "epoch": 0.03121366056674215, + "grad_norm": 0.38537120819091797, + "learning_rate": 1.018e-05, + "loss": 0.011, + "step": 510 + }, + { + "epoch": 0.031825693126874346, + "grad_norm": 0.49801477789878845, + "learning_rate": 1.038e-05, + "loss": 0.0093, + "step": 520 + }, + { + "epoch": 0.03243772568700655, + "grad_norm": 0.3854966163635254, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0116, + "step": 530 + }, + { + "epoch": 0.033049758247138745, + "grad_norm": 0.3163810968399048, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.008, + "step": 540 + }, + { + "epoch": 0.03366179080727095, + "grad_norm": 0.33000636100769043, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0093, + "step": 550 + }, + { + "epoch": 0.034273823367403145, + "grad_norm": 0.3350297808647156, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0083, + "step": 560 + }, + { + "epoch": 0.03488585592753535, + "grad_norm": 0.18780949711799622, + "learning_rate": 1.138e-05, + "loss": 0.0097, + "step": 570 + }, + { + "epoch": 0.035497888487667544, + "grad_norm": 0.20399607717990875, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0092, + "step": 580 + }, + { + "epoch": 0.03610992104779974, + "grad_norm": 0.15931005775928497, + "learning_rate": 1.178e-05, + "loss": 0.0076, + "step": 590 + }, + { + "epoch": 0.03672195360793194, + "grad_norm": 0.20751547813415527, + "learning_rate": 1.198e-05, + "loss": 0.0079, + "step": 600 + }, + { + "epoch": 0.03733398616806414, + "grad_norm": 0.39666953682899475, + "learning_rate": 1.218e-05, + "loss": 0.0072, + "step": 610 + }, + { + "epoch": 0.03794601872819634, + "grad_norm": 0.385407030582428, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0089, + "step": 620 + }, + { + "epoch": 0.03855805128832854, + "grad_norm": 0.5228332877159119, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0092, + "step": 630 + }, + { + "epoch": 0.03917008384846074, + "grad_norm": 0.29315415024757385, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0098, + "step": 640 + }, + { + "epoch": 0.03978211640859294, + "grad_norm": 0.4300646483898163, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0079, + "step": 650 + }, + { + "epoch": 0.04039414896872514, + "grad_norm": 0.38021156191825867, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0103, + "step": 660 + }, + { + "epoch": 0.041006181528857336, + "grad_norm": 0.43489688634872437, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0105, + "step": 670 + }, + { + "epoch": 0.04161821408898953, + "grad_norm": 0.48019328713417053, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0124, + "step": 680 + }, + { + "epoch": 0.042230246649121735, + "grad_norm": 0.28486984968185425, + "learning_rate": 1.378e-05, + "loss": 0.0122, + "step": 690 + }, + { + "epoch": 0.04284227920925393, + "grad_norm": 0.35172080993652344, + "learning_rate": 1.398e-05, + "loss": 0.0093, + "step": 700 + }, + { + "epoch": 0.043454311769386134, + "grad_norm": 0.32531124353408813, + "learning_rate": 1.418e-05, + "loss": 0.0116, + "step": 710 + }, + { + "epoch": 0.04406634432951833, + "grad_norm": 0.388637512922287, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0077, + "step": 720 + }, + { + "epoch": 0.04467837688965053, + "grad_norm": 0.3816429078578949, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0123, + "step": 730 + }, + { + "epoch": 0.04529040944978273, + "grad_norm": 0.22786036133766174, + "learning_rate": 1.478e-05, + "loss": 0.0089, + "step": 740 + }, + { + "epoch": 0.045902442009914925, + "grad_norm": 0.2965328097343445, + "learning_rate": 1.498e-05, + "loss": 0.011, + "step": 750 + }, + { + "epoch": 0.04651447457004713, + "grad_norm": 0.3568362593650818, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0101, + "step": 760 + }, + { + "epoch": 0.047126507130179324, + "grad_norm": 0.2972166836261749, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0093, + "step": 770 + }, + { + "epoch": 0.04773853969031153, + "grad_norm": 0.4221388101577759, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.012, + "step": 780 + }, + { + "epoch": 0.04835057225044372, + "grad_norm": 0.37255391478538513, + "learning_rate": 1.578e-05, + "loss": 0.0085, + "step": 790 + }, + { + "epoch": 0.048962604810575926, + "grad_norm": 0.36007094383239746, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.008, + "step": 800 + }, + { + "epoch": 0.04957463737070812, + "grad_norm": 0.40588808059692383, + "learning_rate": 1.618e-05, + "loss": 0.0081, + "step": 810 + }, + { + "epoch": 0.05018666993084032, + "grad_norm": 0.46563687920570374, + "learning_rate": 1.638e-05, + "loss": 0.0076, + "step": 820 + }, + { + "epoch": 0.05079870249097252, + "grad_norm": 0.3161381483078003, + "learning_rate": 1.658e-05, + "loss": 0.0129, + "step": 830 + }, + { + "epoch": 0.05141073505110472, + "grad_norm": 0.3800298869609833, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0146, + "step": 840 + }, + { + "epoch": 0.05202276761123692, + "grad_norm": 0.36572107672691345, + "learning_rate": 1.698e-05, + "loss": 0.0148, + "step": 850 + }, + { + "epoch": 0.052634800171369116, + "grad_norm": 0.4084141254425049, + "learning_rate": 1.718e-05, + "loss": 0.0085, + "step": 860 + }, + { + "epoch": 0.05324683273150132, + "grad_norm": 0.2906867265701294, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0116, + "step": 870 + }, + { + "epoch": 0.053858865291633515, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.758e-05, + "loss": 0.0076, + "step": 880 + }, + { + "epoch": 0.05447089785176571, + "grad_norm": 0.5292996764183044, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0094, + "step": 890 + }, + { + "epoch": 0.055082930411897914, + "grad_norm": 0.23192685842514038, + "learning_rate": 1.798e-05, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 0.05569496297203011, + "grad_norm": 0.41050270199775696, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0099, + "step": 910 + }, + { + "epoch": 0.05630699553216231, + "grad_norm": 0.3336002230644226, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.05691902809229451, + "grad_norm": 0.46233776211738586, + "learning_rate": 1.858e-05, + "loss": 0.0104, + "step": 930 + }, + { + "epoch": 0.05753106065242671, + "grad_norm": 0.36776405572891235, + "learning_rate": 1.878e-05, + "loss": 0.0115, + "step": 940 + }, + { + "epoch": 0.05814309321255891, + "grad_norm": 0.47848618030548096, + "learning_rate": 1.898e-05, + "loss": 0.0108, + "step": 950 + }, + { + "epoch": 0.058755125772691104, + "grad_norm": 0.35507604479789734, + "learning_rate": 1.918e-05, + "loss": 0.0095, + "step": 960 + }, + { + "epoch": 0.05936715833282331, + "grad_norm": 0.4613397717475891, + "learning_rate": 1.938e-05, + "loss": 0.0119, + "step": 970 + }, + { + "epoch": 0.0599791908929555, + "grad_norm": 0.34492260217666626, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0109, + "step": 980 + }, + { + "epoch": 0.060591223453087706, + "grad_norm": 0.34624582529067993, + "learning_rate": 1.978e-05, + "loss": 0.0099, + "step": 990 + }, + { + "epoch": 0.0612032560132199, + "grad_norm": 0.9161475896835327, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0109, + "step": 1000 + }, + { + "epoch": 0.061815288573352105, + "grad_norm": 0.367807537317276, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0087, + "step": 1010 + }, + { + "epoch": 0.0624273211334843, + "grad_norm": 0.4043216407299042, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.0084, + "step": 1020 + }, + { + "epoch": 0.0630393536936165, + "grad_norm": 0.315305233001709, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0075, + "step": 1030 + }, + { + "epoch": 0.06365138625374869, + "grad_norm": 0.49702969193458557, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0103, + "step": 1040 + }, + { + "epoch": 0.0642634188138809, + "grad_norm": 0.46286216378211975, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0116, + "step": 1050 + }, + { + "epoch": 0.0648754513740131, + "grad_norm": 0.332142174243927, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0103, + "step": 1060 + }, + { + "epoch": 0.0654874839341453, + "grad_norm": 0.6118510961532593, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0118, + "step": 1070 + }, + { + "epoch": 0.06609951649427749, + "grad_norm": 0.49074795842170715, + "learning_rate": 1.999967041472886e-05, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.0667115490544097, + "grad_norm": 0.42575374245643616, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0125, + "step": 1090 + }, + { + "epoch": 0.0673235816145419, + "grad_norm": 0.3223794996738434, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0091, + "step": 1100 + }, + { + "epoch": 0.06793561417467409, + "grad_norm": 0.4952760636806488, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.011, + "step": 1110 + }, + { + "epoch": 0.06854764673480629, + "grad_norm": 0.36144813895225525, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0096, + "step": 1120 + }, + { + "epoch": 0.06915967929493849, + "grad_norm": 0.31190025806427, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0115, + "step": 1130 + }, + { + "epoch": 0.0697717118550707, + "grad_norm": 0.7014928460121155, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.07038374441520288, + "grad_norm": 0.4382205605506897, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0111, + "step": 1150 + }, + { + "epoch": 0.07099577697533509, + "grad_norm": 0.3750714659690857, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0116, + "step": 1160 + }, + { + "epoch": 0.07160780953546729, + "grad_norm": 0.4174371361732483, + "learning_rate": 1.999849173538598e-05, + "loss": 0.009, + "step": 1170 + }, + { + "epoch": 0.07221984209559948, + "grad_norm": 0.44394591450691223, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.07283187465573168, + "grad_norm": 0.43412888050079346, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0098, + "step": 1190 + }, + { + "epoch": 0.07344390721586389, + "grad_norm": 0.6421196460723877, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.01, + "step": 1200 + }, + { + "epoch": 0.07405593977599609, + "grad_norm": 0.6313903331756592, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0137, + "step": 1210 + }, + { + "epoch": 0.07466797233612828, + "grad_norm": 0.49340254068374634, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0104, + "step": 1220 + }, + { + "epoch": 0.07528000489626048, + "grad_norm": 0.40420663356781006, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0126, + "step": 1230 + }, + { + "epoch": 0.07589203745639268, + "grad_norm": 0.3955318033695221, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.013, + "step": 1240 + }, + { + "epoch": 0.07650407001652489, + "grad_norm": 0.4967520236968994, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0098, + "step": 1250 + }, + { + "epoch": 0.07711610257665708, + "grad_norm": 0.3380029499530792, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0084, + "step": 1260 + }, + { + "epoch": 0.07772813513678928, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.009, + "step": 1270 + }, + { + "epoch": 0.07834016769692148, + "grad_norm": 0.4533286392688751, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0093, + "step": 1280 + }, + { + "epoch": 0.07895220025705367, + "grad_norm": 0.39559242129325867, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0106, + "step": 1290 + }, + { + "epoch": 0.07956423281718587, + "grad_norm": 0.23190362751483917, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.01, + "step": 1300 + }, + { + "epoch": 0.08017626537731808, + "grad_norm": 0.4732286334037781, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.08078829793745028, + "grad_norm": 0.3010174036026001, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.08140033049758247, + "grad_norm": 0.3989834189414978, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0097, + "step": 1330 + }, + { + "epoch": 0.08201236305771467, + "grad_norm": 0.4597114622592926, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.01, + "step": 1340 + }, + { + "epoch": 0.08262439561784687, + "grad_norm": 0.426826536655426, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.011, + "step": 1350 + }, + { + "epoch": 0.08323642817797906, + "grad_norm": 0.4876341223716736, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0105, + "step": 1360 + }, + { + "epoch": 0.08384846073811127, + "grad_norm": 0.5444457530975342, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0099, + "step": 1370 + }, + { + "epoch": 0.08446049329824347, + "grad_norm": 0.5096126794815063, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.007, + "step": 1380 + }, + { + "epoch": 0.08507252585837567, + "grad_norm": 0.43828368186950684, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.08568455841850786, + "grad_norm": 0.40163955092430115, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0103, + "step": 1400 + }, + { + "epoch": 0.08629659097864006, + "grad_norm": 0.3110432028770447, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0115, + "step": 1410 + }, + { + "epoch": 0.08690862353877227, + "grad_norm": 0.8393893241882324, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.012, + "step": 1420 + }, + { + "epoch": 0.08752065609890446, + "grad_norm": 0.2751714289188385, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0093, + "step": 1430 + }, + { + "epoch": 0.08813268865903666, + "grad_norm": 0.36969971656799316, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0112, + "step": 1440 + }, + { + "epoch": 0.08874472121916886, + "grad_norm": 0.3721938729286194, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08935675377930107, + "grad_norm": 0.26564934849739075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0107, + "step": 1460 + }, + { + "epoch": 0.08996878633943325, + "grad_norm": 0.36552169919013977, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0123, + "step": 1470 + }, + { + "epoch": 0.09058081889956546, + "grad_norm": 0.23664990067481995, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0074, + "step": 1480 + }, + { + "epoch": 0.09119285145969766, + "grad_norm": 0.49903133511543274, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.09180488401982985, + "grad_norm": 0.43505051732063293, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 0.09241691657996205, + "grad_norm": 0.20318932831287384, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0088, + "step": 1510 + }, + { + "epoch": 0.09302894914009426, + "grad_norm": 0.3289708197116852, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.008, + "step": 1520 + }, + { + "epoch": 0.09364098170022646, + "grad_norm": 0.3920934200286865, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0145, + "step": 1530 + }, + { + "epoch": 0.09425301426035865, + "grad_norm": 0.40396374464035034, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.09486504682049085, + "grad_norm": 0.4044182300567627, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.012, + "step": 1550 + }, + { + "epoch": 0.09547707938062305, + "grad_norm": 0.2318611741065979, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0115, + "step": 1560 + }, + { + "epoch": 0.09608911194075524, + "grad_norm": 0.3905714750289917, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.008, + "step": 1570 + }, + { + "epoch": 0.09670114450088745, + "grad_norm": 0.2516922652721405, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0084, + "step": 1580 + }, + { + "epoch": 0.09731317706101965, + "grad_norm": 0.338455468416214, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0122, + "step": 1590 + }, + { + "epoch": 0.09792520962115185, + "grad_norm": 0.31875041127204895, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0081, + "step": 1600 + }, + { + "epoch": 0.09853724218128404, + "grad_norm": 0.2996121644973755, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0068, + "step": 1610 + }, + { + "epoch": 0.09914927474141624, + "grad_norm": 0.4381162226200104, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0103, + "step": 1620 + }, + { + "epoch": 0.09976130730154845, + "grad_norm": 0.5531038045883179, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0168, + "step": 1630 + }, + { + "epoch": 0.10037333986168064, + "grad_norm": 1.1283385753631592, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0119, + "step": 1640 + }, + { + "epoch": 0.10098537242181284, + "grad_norm": 0.38017332553863525, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0096, + "step": 1650 + }, + { + "epoch": 0.10159740498194504, + "grad_norm": 0.4669477045536041, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0111, + "step": 1660 + }, + { + "epoch": 0.10220943754207724, + "grad_norm": 0.3903254270553589, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0098, + "step": 1670 + }, + { + "epoch": 0.10282147010220943, + "grad_norm": 0.49671587347984314, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0083, + "step": 1680 + }, + { + "epoch": 0.10343350266234164, + "grad_norm": 0.36555853486061096, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0088, + "step": 1690 + }, + { + "epoch": 0.10404553522247384, + "grad_norm": 0.21804726123809814, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0086, + "step": 1700 + }, + { + "epoch": 0.10465756778260603, + "grad_norm": 0.6744784116744995, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0109, + "step": 1710 + }, + { + "epoch": 0.10526960034273823, + "grad_norm": 0.34379470348358154, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0096, + "step": 1720 + }, + { + "epoch": 0.10588163290287043, + "grad_norm": 0.27760598063468933, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 0.10649366546300264, + "grad_norm": 0.36294442415237427, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10710569802313483, + "grad_norm": 0.42200908064842224, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.011, + "step": 1750 + }, + { + "epoch": 0.10771773058326703, + "grad_norm": 0.47863906621932983, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0107, + "step": 1760 + }, + { + "epoch": 0.10832976314339923, + "grad_norm": 0.32717248797416687, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0112, + "step": 1770 + }, + { + "epoch": 0.10894179570353142, + "grad_norm": 0.4255545735359192, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0106, + "step": 1780 + }, + { + "epoch": 0.10955382826366362, + "grad_norm": 0.5034983158111572, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0098, + "step": 1790 + }, + { + "epoch": 0.11016586082379583, + "grad_norm": 0.37071412801742554, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0099, + "step": 1800 + }, + { + "epoch": 0.11077789338392803, + "grad_norm": 0.23624737560749054, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0069, + "step": 1810 + }, + { + "epoch": 0.11138992594406022, + "grad_norm": 0.5815485715866089, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0095, + "step": 1820 + }, + { + "epoch": 0.11200195850419242, + "grad_norm": 1.1828722953796387, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0104, + "step": 1830 + }, + { + "epoch": 0.11261399106432463, + "grad_norm": 0.38099589943885803, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0106, + "step": 1840 + }, + { + "epoch": 0.11322602362445681, + "grad_norm": 0.38476184010505676, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0112, + "step": 1850 + }, + { + "epoch": 0.11383805618458902, + "grad_norm": 0.48982104659080505, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0125, + "step": 1860 + }, + { + "epoch": 0.11445008874472122, + "grad_norm": 0.4165821671485901, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0099, + "step": 1870 + }, + { + "epoch": 0.11506212130485342, + "grad_norm": 0.3412662446498871, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0061, + "step": 1880 + }, + { + "epoch": 0.11567415386498561, + "grad_norm": 0.46617937088012695, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0129, + "step": 1890 + }, + { + "epoch": 0.11628618642511782, + "grad_norm": 0.2705824077129364, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0082, + "step": 1900 + }, + { + "epoch": 0.11689821898525002, + "grad_norm": 0.3567829430103302, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0125, + "step": 1910 + }, + { + "epoch": 0.11751025154538221, + "grad_norm": 0.4438138008117676, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0129, + "step": 1920 + }, + { + "epoch": 0.11812228410551441, + "grad_norm": 0.356703519821167, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0083, + "step": 1930 + }, + { + "epoch": 0.11873431666564661, + "grad_norm": 0.6039804220199585, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0086, + "step": 1940 + }, + { + "epoch": 0.11934634922577882, + "grad_norm": 0.4572801887989044, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0067, + "step": 1950 + }, + { + "epoch": 0.119958381785911, + "grad_norm": 0.5063445568084717, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0091, + "step": 1960 + }, + { + "epoch": 0.12057041434604321, + "grad_norm": 0.3467857837677002, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.008, + "step": 1970 + }, + { + "epoch": 0.12118244690617541, + "grad_norm": 0.4875742197036743, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0102, + "step": 1980 + }, + { + "epoch": 0.1217944794663076, + "grad_norm": 0.3209119141101837, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0081, + "step": 1990 + }, + { + "epoch": 0.1224065120264398, + "grad_norm": 0.4731980860233307, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0118, + "step": 2000 + }, + { + "epoch": 0.123018544586572, + "grad_norm": 0.5742963552474976, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0125, + "step": 2010 + }, + { + "epoch": 0.12363057714670421, + "grad_norm": 0.41357406973838806, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.1242426097068364, + "grad_norm": 0.6277521252632141, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0096, + "step": 2030 + }, + { + "epoch": 0.1248546422669686, + "grad_norm": 0.41252902150154114, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0108, + "step": 2040 + }, + { + "epoch": 0.1254666748271008, + "grad_norm": 0.782122790813446, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0134, + "step": 2050 + }, + { + "epoch": 0.126078707387233, + "grad_norm": 0.45011264085769653, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0102, + "step": 2060 + }, + { + "epoch": 0.1266907399473652, + "grad_norm": 0.2724951207637787, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0088, + "step": 2070 + }, + { + "epoch": 0.12730277250749739, + "grad_norm": 0.2351481169462204, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.009, + "step": 2080 + }, + { + "epoch": 0.1279148050676296, + "grad_norm": 0.34568479657173157, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0092, + "step": 2090 + }, + { + "epoch": 0.1285268376277618, + "grad_norm": 0.44493499398231506, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0087, + "step": 2100 + }, + { + "epoch": 0.129138870187894, + "grad_norm": 0.3011283874511719, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0108, + "step": 2110 + }, + { + "epoch": 0.1297509027480262, + "grad_norm": 0.4170232117176056, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.1303629353081584, + "grad_norm": 0.2696056365966797, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0093, + "step": 2130 + }, + { + "epoch": 0.1309749678682906, + "grad_norm": 0.4092336893081665, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0083, + "step": 2140 + }, + { + "epoch": 0.13158700042842278, + "grad_norm": 0.36637401580810547, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.01, + "step": 2150 + }, + { + "epoch": 0.13219903298855498, + "grad_norm": 0.28675684332847595, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0079, + "step": 2160 + }, + { + "epoch": 0.13281106554868718, + "grad_norm": 0.27699902653694153, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0071, + "step": 2170 + }, + { + "epoch": 0.1334230981088194, + "grad_norm": 0.3832298517227173, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0104, + "step": 2180 + }, + { + "epoch": 0.1340351306689516, + "grad_norm": 0.3590598702430725, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 0.1346471632290838, + "grad_norm": 0.21830014884471893, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0093, + "step": 2200 + }, + { + "epoch": 0.135259195789216, + "grad_norm": 0.342492938041687, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0109, + "step": 2210 + }, + { + "epoch": 0.13587122834934817, + "grad_norm": 0.6337023973464966, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0082, + "step": 2220 + }, + { + "epoch": 0.13648326090948038, + "grad_norm": 0.41742798686027527, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13709529346961258, + "grad_norm": 0.3180190324783325, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 0.13770732602974478, + "grad_norm": 0.36720144748687744, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0114, + "step": 2250 + }, + { + "epoch": 0.13831935858987698, + "grad_norm": 0.29457366466522217, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0077, + "step": 2260 + }, + { + "epoch": 0.1389313911500092, + "grad_norm": 0.24702222645282745, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0074, + "step": 2270 + }, + { + "epoch": 0.1395434237101414, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0079, + "step": 2280 + }, + { + "epoch": 0.14015545627027357, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0127, + "step": 2290 + }, + { + "epoch": 0.14076748883040577, + "grad_norm": 0.44338247179985046, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0077, + "step": 2300 + }, + { + "epoch": 0.14137952139053797, + "grad_norm": 0.31765618920326233, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0091, + "step": 2310 + }, + { + "epoch": 0.14199155395067017, + "grad_norm": 0.322534441947937, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0068, + "step": 2320 + }, + { + "epoch": 0.14260358651080238, + "grad_norm": 0.23571068048477173, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0094, + "step": 2330 + }, + { + "epoch": 0.14321561907093458, + "grad_norm": 0.26818808913230896, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0092, + "step": 2340 + }, + { + "epoch": 0.14382765163106678, + "grad_norm": 0.31886982917785645, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0084, + "step": 2350 + }, + { + "epoch": 0.14443968419119896, + "grad_norm": 0.5176070928573608, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0104, + "step": 2360 + }, + { + "epoch": 0.14505171675133116, + "grad_norm": 0.4322161078453064, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0081, + "step": 2370 + }, + { + "epoch": 0.14566374931146336, + "grad_norm": 0.4076510965824127, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0102, + "step": 2380 + }, + { + "epoch": 0.14627578187159557, + "grad_norm": 0.3808838725090027, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0096, + "step": 2390 + }, + { + "epoch": 0.14688781443172777, + "grad_norm": 0.5045232176780701, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0109, + "step": 2400 + }, + { + "epoch": 0.14749984699185997, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0077, + "step": 2410 + }, + { + "epoch": 0.14811187955199218, + "grad_norm": 0.28561875224113464, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0141, + "step": 2420 + }, + { + "epoch": 0.14872391211212435, + "grad_norm": 0.414410799741745, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0094, + "step": 2430 + }, + { + "epoch": 0.14933594467225655, + "grad_norm": 0.4587285816669464, + "learning_rate": 1.989086647373215e-05, + "loss": 0.009, + "step": 2440 + }, + { + "epoch": 0.14994797723238876, + "grad_norm": 0.7567377686500549, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0084, + "step": 2450 + }, + { + "epoch": 0.15056000979252096, + "grad_norm": 0.4980221390724182, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0087, + "step": 2460 + }, + { + "epoch": 0.15117204235265316, + "grad_norm": 0.41810303926467896, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0082, + "step": 2470 + }, + { + "epoch": 0.15178407491278537, + "grad_norm": 0.4193445146083832, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 0.15239610747291757, + "grad_norm": 0.2561246156692505, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0074, + "step": 2490 + }, + { + "epoch": 0.15300814003304977, + "grad_norm": 0.22316500544548035, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0069, + "step": 2500 + }, + { + "epoch": 0.15362017259318195, + "grad_norm": 0.31504112482070923, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0097, + "step": 2510 + }, + { + "epoch": 0.15423220515331415, + "grad_norm": 0.2944568991661072, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0101, + "step": 2520 + }, + { + "epoch": 0.15484423771344635, + "grad_norm": 0.2744649052619934, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0074, + "step": 2530 + }, + { + "epoch": 0.15545627027357856, + "grad_norm": 0.2717166841030121, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.007, + "step": 2540 + }, + { + "epoch": 0.15606830283371076, + "grad_norm": 0.32652929425239563, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.15668033539384296, + "grad_norm": 0.3169964849948883, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0089, + "step": 2560 + }, + { + "epoch": 0.15729236795397517, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0083, + "step": 2570 + }, + { + "epoch": 0.15790440051410734, + "grad_norm": 0.3869011700153351, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0088, + "step": 2580 + }, + { + "epoch": 0.15851643307423954, + "grad_norm": 0.2944110333919525, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0085, + "step": 2590 + }, + { + "epoch": 0.15912846563437175, + "grad_norm": 0.27993839979171753, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0097, + "step": 2600 + }, + { + "epoch": 0.15974049819450395, + "grad_norm": 0.42018845677375793, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0104, + "step": 2610 + }, + { + "epoch": 0.16035253075463615, + "grad_norm": 0.45006832480430603, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0072, + "step": 2620 + }, + { + "epoch": 0.16096456331476836, + "grad_norm": 0.275564581155777, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0072, + "step": 2630 + }, + { + "epoch": 0.16157659587490056, + "grad_norm": 0.503052294254303, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0091, + "step": 2640 + }, + { + "epoch": 0.16218862843503273, + "grad_norm": 0.33740976452827454, + "learning_rate": 1.985678043265668e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.16280066099516494, + "grad_norm": 0.5379078984260559, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0097, + "step": 2660 + }, + { + "epoch": 0.16341269355529714, + "grad_norm": 0.3605813980102539, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0109, + "step": 2670 + }, + { + "epoch": 0.16402472611542934, + "grad_norm": 0.49490585923194885, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 0.16463675867556155, + "grad_norm": 0.29894375801086426, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0089, + "step": 2690 + }, + { + "epoch": 0.16524879123569375, + "grad_norm": 0.395270437002182, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 0.16586082379582595, + "grad_norm": 0.25507843494415283, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0078, + "step": 2710 + }, + { + "epoch": 0.16647285635595813, + "grad_norm": 0.3304852843284607, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0178, + "step": 2720 + }, + { + "epoch": 0.16708488891609033, + "grad_norm": 0.4356633126735687, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0082, + "step": 2730 + }, + { + "epoch": 0.16769692147622253, + "grad_norm": 0.4104527533054352, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 0.16830895403635474, + "grad_norm": 0.25723493099212646, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0097, + "step": 2750 + }, + { + "epoch": 0.16892098659648694, + "grad_norm": 0.3280608057975769, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0079, + "step": 2760 + }, + { + "epoch": 0.16953301915661914, + "grad_norm": 0.4641128480434418, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0081, + "step": 2770 + }, + { + "epoch": 0.17014505171675134, + "grad_norm": 0.2704941928386688, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0112, + "step": 2780 + }, + { + "epoch": 0.17075708427688352, + "grad_norm": 0.42343780398368835, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0084, + "step": 2790 + }, + { + "epoch": 0.17136911683701572, + "grad_norm": 0.2606532573699951, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0085, + "step": 2800 + }, + { + "epoch": 0.17198114939714793, + "grad_norm": 0.39099374413490295, + "learning_rate": 1.982773261916081e-05, + "loss": 0.014, + "step": 2810 + }, + { + "epoch": 0.17259318195728013, + "grad_norm": 0.32653889060020447, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0095, + "step": 2820 + }, + { + "epoch": 0.17320521451741233, + "grad_norm": 0.34765321016311646, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17381724707754453, + "grad_norm": 0.2844177186489105, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.011, + "step": 2840 + }, + { + "epoch": 0.17442927963767674, + "grad_norm": 0.5079899430274963, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0114, + "step": 2850 + }, + { + "epoch": 0.1750413121978089, + "grad_norm": 0.4043678045272827, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0073, + "step": 2860 + }, + { + "epoch": 0.17565334475794112, + "grad_norm": 0.3833003640174866, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0076, + "step": 2870 + }, + { + "epoch": 0.17626537731807332, + "grad_norm": 0.2826341986656189, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0072, + "step": 2880 + }, + { + "epoch": 0.17687740987820552, + "grad_norm": 0.6043460965156555, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0084, + "step": 2890 + }, + { + "epoch": 0.17748944243833772, + "grad_norm": 0.3238481879234314, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.17810147499846993, + "grad_norm": 0.45817995071411133, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0103, + "step": 2910 + }, + { + "epoch": 0.17871350755860213, + "grad_norm": 0.21048744022846222, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0094, + "step": 2920 + }, + { + "epoch": 0.1793255401187343, + "grad_norm": 0.3401891887187958, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0058, + "step": 2930 + }, + { + "epoch": 0.1799375726788665, + "grad_norm": 0.3655509948730469, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0071, + "step": 2940 + }, + { + "epoch": 0.1805496052389987, + "grad_norm": 0.47406241297721863, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.18116163779913091, + "grad_norm": 0.3278841972351074, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0121, + "step": 2960 + }, + { + "epoch": 0.18177367035926312, + "grad_norm": 0.271436482667923, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.008, + "step": 2970 + }, + { + "epoch": 0.18238570291939532, + "grad_norm": 0.41475561261177063, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.008, + "step": 2980 + }, + { + "epoch": 0.18299773547952752, + "grad_norm": 0.5389090776443481, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0091, + "step": 2990 + }, + { + "epoch": 0.1836097680396597, + "grad_norm": 0.3958609700202942, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0106, + "step": 3000 + }, + { + "epoch": 0.1842218005997919, + "grad_norm": 0.3456019461154938, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0084, + "step": 3010 + }, + { + "epoch": 0.1848338331599241, + "grad_norm": 0.2959386706352234, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0071, + "step": 3020 + }, + { + "epoch": 0.1854458657200563, + "grad_norm": 0.2617223858833313, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0082, + "step": 3030 + }, + { + "epoch": 0.1860578982801885, + "grad_norm": 0.45173966884613037, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0105, + "step": 3040 + }, + { + "epoch": 0.1866699308403207, + "grad_norm": 0.4127421975135803, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.008, + "step": 3050 + }, + { + "epoch": 0.18728196340045292, + "grad_norm": 0.3142230808734894, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0085, + "step": 3060 + }, + { + "epoch": 0.1878939959605851, + "grad_norm": 0.49720287322998047, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0089, + "step": 3070 + }, + { + "epoch": 0.1885060285207173, + "grad_norm": 0.6417365074157715, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0088, + "step": 3080 + }, + { + "epoch": 0.1891180610808495, + "grad_norm": 0.44801583886146545, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0098, + "step": 3090 + }, + { + "epoch": 0.1897300936409817, + "grad_norm": 0.3606127202510834, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0087, + "step": 3100 + }, + { + "epoch": 0.1903421262011139, + "grad_norm": 0.268971711397171, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0092, + "step": 3110 + }, + { + "epoch": 0.1909541587612461, + "grad_norm": 0.2367011308670044, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0074, + "step": 3120 + }, + { + "epoch": 0.1915661913213783, + "grad_norm": 0.41643625497817993, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0062, + "step": 3130 + }, + { + "epoch": 0.19217822388151048, + "grad_norm": 0.33202284574508667, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0081, + "step": 3140 + }, + { + "epoch": 0.1927902564416427, + "grad_norm": 0.279813289642334, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0074, + "step": 3150 + }, + { + "epoch": 0.1934022890017749, + "grad_norm": 0.5127174258232117, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0101, + "step": 3160 + }, + { + "epoch": 0.1940143215619071, + "grad_norm": 0.36921849846839905, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.1946263541220393, + "grad_norm": 0.3509728014469147, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0114, + "step": 3180 + }, + { + "epoch": 0.1952383866821715, + "grad_norm": 0.3088139295578003, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0089, + "step": 3190 + }, + { + "epoch": 0.1958504192423037, + "grad_norm": 0.43653762340545654, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0116, + "step": 3200 + }, + { + "epoch": 0.19646245180243588, + "grad_norm": 0.2522308826446533, + "learning_rate": 1.974353140804231e-05, + "loss": 0.007, + "step": 3210 + }, + { + "epoch": 0.19707448436256808, + "grad_norm": 0.37519100308418274, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 0.19768651692270028, + "grad_norm": 0.379027783870697, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0086, + "step": 3230 + }, + { + "epoch": 0.1982985494828325, + "grad_norm": 0.2713090479373932, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0073, + "step": 3240 + }, + { + "epoch": 0.1989105820429647, + "grad_norm": 0.41106846928596497, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0188, + "step": 3250 + }, + { + "epoch": 0.1995226146030969, + "grad_norm": 0.3914758861064911, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 0.2001346471632291, + "grad_norm": 0.4763018488883972, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0082, + "step": 3270 + }, + { + "epoch": 0.20074667972336127, + "grad_norm": 0.23002664744853973, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0085, + "step": 3280 + }, + { + "epoch": 0.20135871228349347, + "grad_norm": 0.2887377142906189, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0074, + "step": 3290 + }, + { + "epoch": 0.20197074484362568, + "grad_norm": 0.2322079837322235, + "learning_rate": 1.972231769371516e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 0.20258277740375788, + "grad_norm": 0.39307233691215515, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0095, + "step": 3310 + }, + { + "epoch": 0.20319480996389008, + "grad_norm": 0.5209783315658569, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.012, + "step": 3320 + }, + { + "epoch": 0.20380684252402229, + "grad_norm": 0.45187172293663025, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.2044188750841545, + "grad_norm": 0.480970174074173, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0072, + "step": 3340 + }, + { + "epoch": 0.20503090764428666, + "grad_norm": 0.30979010462760925, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0091, + "step": 3350 + }, + { + "epoch": 0.20564294020441887, + "grad_norm": 0.6410729289054871, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0096, + "step": 3360 + }, + { + "epoch": 0.20625497276455107, + "grad_norm": 0.23707512021064758, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0078, + "step": 3370 + }, + { + "epoch": 0.20686700532468327, + "grad_norm": 0.3029544949531555, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0115, + "step": 3380 + }, + { + "epoch": 0.20747903788481548, + "grad_norm": 0.28677740693092346, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0068, + "step": 3390 + }, + { + "epoch": 0.20809107044494768, + "grad_norm": 0.2433662712574005, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0082, + "step": 3400 + }, + { + "epoch": 0.20870310300507988, + "grad_norm": 0.38066667318344116, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0089, + "step": 3410 + }, + { + "epoch": 0.20931513556521206, + "grad_norm": 0.3830282390117645, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0073, + "step": 3420 + }, + { + "epoch": 0.20992716812534426, + "grad_norm": 0.359684556722641, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0088, + "step": 3430 + }, + { + "epoch": 0.21053920068547646, + "grad_norm": 0.3497346341609955, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0082, + "step": 3440 + }, + { + "epoch": 0.21115123324560867, + "grad_norm": 0.3664748966693878, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.21176326580574087, + "grad_norm": 0.382804811000824, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0077, + "step": 3460 + }, + { + "epoch": 0.21237529836587307, + "grad_norm": 0.22746194899082184, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0107, + "step": 3470 + }, + { + "epoch": 0.21298733092600527, + "grad_norm": 0.4094266891479492, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0096, + "step": 3480 + }, + { + "epoch": 0.21359936348613745, + "grad_norm": 0.26990365982055664, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0089, + "step": 3490 + }, + { + "epoch": 0.21421139604626965, + "grad_norm": 0.2602371275424957, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0074, + "step": 3500 + }, + { + "epoch": 0.21482342860640186, + "grad_norm": 0.34200435876846313, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21543546116653406, + "grad_norm": 0.4260508716106415, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.21604749372666626, + "grad_norm": 0.4017483592033386, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0103, + "step": 3530 + }, + { + "epoch": 0.21665952628679847, + "grad_norm": 0.40005844831466675, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0094, + "step": 3540 + }, + { + "epoch": 0.21727155884693067, + "grad_norm": 0.3856841027736664, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0095, + "step": 3550 + }, + { + "epoch": 0.21788359140706284, + "grad_norm": 0.3245168626308441, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0067, + "step": 3560 + }, + { + "epoch": 0.21849562396719505, + "grad_norm": 0.2698485255241394, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0079, + "step": 3570 + }, + { + "epoch": 0.21910765652732725, + "grad_norm": 0.24520452320575714, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0109, + "step": 3580 + }, + { + "epoch": 0.21971968908745945, + "grad_norm": 0.397175133228302, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0114, + "step": 3590 + }, + { + "epoch": 0.22033172164759166, + "grad_norm": 0.40339091420173645, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.22094375420772386, + "grad_norm": 0.404435396194458, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0087, + "step": 3610 + }, + { + "epoch": 0.22155578676785606, + "grad_norm": 0.3300188183784485, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0073, + "step": 3620 + }, + { + "epoch": 0.22216781932798824, + "grad_norm": 0.23486892879009247, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0087, + "step": 3630 + }, + { + "epoch": 0.22277985188812044, + "grad_norm": 0.37211188673973083, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.22339188444825264, + "grad_norm": 0.32422709465026855, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.009, + "step": 3650 + }, + { + "epoch": 0.22400391700838485, + "grad_norm": 0.43535664677619934, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0099, + "step": 3660 + }, + { + "epoch": 0.22461594956851705, + "grad_norm": 0.3295724093914032, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.22522798212864925, + "grad_norm": 0.2840734124183655, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0082, + "step": 3680 + }, + { + "epoch": 0.22584001468878145, + "grad_norm": 0.2861844599246979, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0079, + "step": 3690 + }, + { + "epoch": 0.22645204724891363, + "grad_norm": 0.3194407820701599, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0071, + "step": 3700 + }, + { + "epoch": 0.22706407980904583, + "grad_norm": 0.38770729303359985, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0076, + "step": 3710 + }, + { + "epoch": 0.22767611236917804, + "grad_norm": 0.4637960195541382, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0093, + "step": 3720 + }, + { + "epoch": 0.22828814492931024, + "grad_norm": 0.31972312927246094, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0078, + "step": 3730 + }, + { + "epoch": 0.22890017748944244, + "grad_norm": 0.5273001790046692, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0076, + "step": 3740 + }, + { + "epoch": 0.22951221004957464, + "grad_norm": 0.30589622259140015, + "learning_rate": 1.960385541132679e-05, + "loss": 0.009, + "step": 3750 + }, + { + "epoch": 0.23012424260970685, + "grad_norm": 0.31634265184402466, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0063, + "step": 3760 + }, + { + "epoch": 0.23073627516983902, + "grad_norm": 0.32762402296066284, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0081, + "step": 3770 + }, + { + "epoch": 0.23134830772997123, + "grad_norm": 0.42696496844291687, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 0.23196034029010343, + "grad_norm": 0.4676671624183655, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0093, + "step": 3790 + }, + { + "epoch": 0.23257237285023563, + "grad_norm": 0.3347911536693573, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 0.23318440541036783, + "grad_norm": 0.3083193600177765, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0072, + "step": 3810 + }, + { + "epoch": 0.23379643797050004, + "grad_norm": 0.38178423047065735, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0083, + "step": 3820 + }, + { + "epoch": 0.23440847053063224, + "grad_norm": 0.2796846330165863, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0072, + "step": 3830 + }, + { + "epoch": 0.23502050309076442, + "grad_norm": 0.37444883584976196, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.23563253565089662, + "grad_norm": 0.3286772668361664, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23624456821102882, + "grad_norm": 0.45423513650894165, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.23685660077116102, + "grad_norm": 0.36881721019744873, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0068, + "step": 3870 + }, + { + "epoch": 0.23746863333129323, + "grad_norm": 0.3560579717159271, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0084, + "step": 3880 + }, + { + "epoch": 0.23808066589142543, + "grad_norm": 0.43887296319007874, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0076, + "step": 3890 + }, + { + "epoch": 0.23869269845155763, + "grad_norm": 0.3080165982246399, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0069, + "step": 3900 + }, + { + "epoch": 0.2393047310116898, + "grad_norm": 0.2327195703983307, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.239916763571822, + "grad_norm": 0.5960802435874939, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0088, + "step": 3920 + }, + { + "epoch": 0.24052879613195421, + "grad_norm": 0.36213600635528564, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0076, + "step": 3930 + }, + { + "epoch": 0.24114082869208642, + "grad_norm": 0.2950032949447632, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0115, + "step": 3940 + }, + { + "epoch": 0.24175286125221862, + "grad_norm": 0.4527084529399872, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0089, + "step": 3950 + }, + { + "epoch": 0.24236489381235082, + "grad_norm": 0.4422491192817688, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0135, + "step": 3960 + }, + { + "epoch": 0.24297692637248303, + "grad_norm": 0.45049232244491577, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 0.2435889589326152, + "grad_norm": 0.2566494941711426, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0095, + "step": 3980 + }, + { + "epoch": 0.2442009914927474, + "grad_norm": 0.49880343675613403, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0099, + "step": 3990 + }, + { + "epoch": 0.2448130240528796, + "grad_norm": 0.4699341952800751, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0084, + "step": 4000 + }, + { + "epoch": 0.2454250566130118, + "grad_norm": 0.41230708360671997, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 0.246037089173144, + "grad_norm": 0.4836854934692383, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.01, + "step": 4020 + }, + { + "epoch": 0.24664912173327622, + "grad_norm": 0.3056115508079529, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0082, + "step": 4030 + }, + { + "epoch": 0.24726115429340842, + "grad_norm": 0.151325523853302, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0062, + "step": 4040 + }, + { + "epoch": 0.2478731868535406, + "grad_norm": 0.3798811137676239, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0081, + "step": 4050 + }, + { + "epoch": 0.2484852194136728, + "grad_norm": 0.3308229148387909, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 0.249097251973805, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0074, + "step": 4070 + }, + { + "epoch": 0.2497092845339372, + "grad_norm": 0.24179549515247345, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 0.2503213170940694, + "grad_norm": 0.20879383385181427, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 0.2509333496542016, + "grad_norm": 0.39275774359703064, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.2515453822143338, + "grad_norm": 0.2925782799720764, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0095, + "step": 4110 + }, + { + "epoch": 0.252157414774466, + "grad_norm": 0.6465128660202026, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0102, + "step": 4120 + }, + { + "epoch": 0.2527694473345982, + "grad_norm": 0.34663915634155273, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.007, + "step": 4130 + }, + { + "epoch": 0.2533814798947304, + "grad_norm": 0.3387165367603302, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0091, + "step": 4140 + }, + { + "epoch": 0.2539935124548626, + "grad_norm": 0.32989630103111267, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0084, + "step": 4150 + }, + { + "epoch": 0.25460554501499477, + "grad_norm": 0.22870391607284546, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0071, + "step": 4160 + }, + { + "epoch": 0.255217577575127, + "grad_norm": 0.3866496682167053, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0079, + "step": 4170 + }, + { + "epoch": 0.2558296101352592, + "grad_norm": 0.29885268211364746, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0068, + "step": 4180 + }, + { + "epoch": 0.2564416426953914, + "grad_norm": 0.4693736135959625, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 0.2570536752555236, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0074, + "step": 4200 + }, + { + "epoch": 0.2576657078156558, + "grad_norm": 0.21141012012958527, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 0.258277740375788, + "grad_norm": 0.2284570336341858, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0102, + "step": 4220 + }, + { + "epoch": 0.2588897729359202, + "grad_norm": 0.4675048887729645, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0112, + "step": 4230 + }, + { + "epoch": 0.2595018054960524, + "grad_norm": 0.3906441628932953, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0088, + "step": 4240 + }, + { + "epoch": 0.2601138380561846, + "grad_norm": 0.22990387678146362, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 0.2607258706163168, + "grad_norm": 0.41871073842048645, + "learning_rate": 1.944490251296856e-05, + "loss": 0.009, + "step": 4260 + }, + { + "epoch": 0.261337903176449, + "grad_norm": 0.2724440395832062, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0074, + "step": 4270 + }, + { + "epoch": 0.2619499357365812, + "grad_norm": 0.42590636014938354, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0069, + "step": 4280 + }, + { + "epoch": 0.2625619682967134, + "grad_norm": 0.3604855239391327, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0068, + "step": 4290 + }, + { + "epoch": 0.26317400085684556, + "grad_norm": 0.475304514169693, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0082, + "step": 4300 + }, + { + "epoch": 0.26378603341697776, + "grad_norm": 0.24752479791641235, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0065, + "step": 4310 + }, + { + "epoch": 0.26439806597710996, + "grad_norm": 0.4384835958480835, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0104, + "step": 4320 + }, + { + "epoch": 0.26501009853724217, + "grad_norm": 0.24999107420444489, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0076, + "step": 4330 + }, + { + "epoch": 0.26562213109737437, + "grad_norm": 0.292491614818573, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 0.2662341636575066, + "grad_norm": 0.2380208522081375, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0069, + "step": 4350 + }, + { + "epoch": 0.2668461962176388, + "grad_norm": 0.2906023859977722, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0063, + "step": 4360 + }, + { + "epoch": 0.267458228777771, + "grad_norm": 0.4718990623950958, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0074, + "step": 4370 + }, + { + "epoch": 0.2680702613379032, + "grad_norm": 0.33257269859313965, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0073, + "step": 4380 + }, + { + "epoch": 0.2686822938980354, + "grad_norm": 0.34411463141441345, + "learning_rate": 1.940024231916886e-05, + "loss": 0.006, + "step": 4390 + }, + { + "epoch": 0.2692943264581676, + "grad_norm": 0.40312516689300537, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0093, + "step": 4400 + }, + { + "epoch": 0.2699063590182998, + "grad_norm": 0.2248350828886032, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0082, + "step": 4410 + }, + { + "epoch": 0.270518391578432, + "grad_norm": 0.30094820261001587, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0092, + "step": 4420 + }, + { + "epoch": 0.2711304241385642, + "grad_norm": 0.4277440309524536, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0099, + "step": 4430 + }, + { + "epoch": 0.27174245669869634, + "grad_norm": 0.2876254916191101, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0078, + "step": 4440 + }, + { + "epoch": 0.27235448925882855, + "grad_norm": 0.3453986346721649, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0096, + "step": 4450 + }, + { + "epoch": 0.27296652181896075, + "grad_norm": 0.31379634141921997, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0088, + "step": 4460 + }, + { + "epoch": 0.27357855437909295, + "grad_norm": 0.294477254152298, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0073, + "step": 4470 + }, + { + "epoch": 0.27419058693922516, + "grad_norm": 0.3773270845413208, + "learning_rate": 1.936834723687526e-05, + "loss": 0.008, + "step": 4480 + }, + { + "epoch": 0.27480261949935736, + "grad_norm": 0.31942978501319885, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0086, + "step": 4490 + }, + { + "epoch": 0.27541465205948956, + "grad_norm": 0.46827632188796997, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27602668461962176, + "grad_norm": 0.2735249102115631, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0107, + "step": 4510 + }, + { + "epoch": 0.27663871717975397, + "grad_norm": 0.30048197507858276, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0082, + "step": 4520 + }, + { + "epoch": 0.27725074973988617, + "grad_norm": 0.3507469594478607, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0093, + "step": 4530 + }, + { + "epoch": 0.2778627823000184, + "grad_norm": 0.5642989277839661, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0091, + "step": 4540 + }, + { + "epoch": 0.2784748148601506, + "grad_norm": 0.2769993245601654, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0105, + "step": 4550 + }, + { + "epoch": 0.2790868474202828, + "grad_norm": 0.30269622802734375, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0066, + "step": 4560 + }, + { + "epoch": 0.279698879980415, + "grad_norm": 0.3717023432254791, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0114, + "step": 4570 + }, + { + "epoch": 0.28031091254054713, + "grad_norm": 0.5065163373947144, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0075, + "step": 4580 + }, + { + "epoch": 0.28092294510067933, + "grad_norm": 0.4302189350128174, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0077, + "step": 4590 + }, + { + "epoch": 0.28153497766081154, + "grad_norm": 0.44008374214172363, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0086, + "step": 4600 + }, + { + "epoch": 0.28214701022094374, + "grad_norm": 0.4647364318370819, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0073, + "step": 4610 + }, + { + "epoch": 0.28275904278107594, + "grad_norm": 0.4229913651943207, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0077, + "step": 4620 + }, + { + "epoch": 0.28337107534120815, + "grad_norm": 0.36600178480148315, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.28398310790134035, + "grad_norm": 0.47143280506134033, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0085, + "step": 4640 + }, + { + "epoch": 0.28459514046147255, + "grad_norm": 0.29140496253967285, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0056, + "step": 4650 + }, + { + "epoch": 0.28520717302160475, + "grad_norm": 0.3964666426181793, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0071, + "step": 4660 + }, + { + "epoch": 0.28581920558173696, + "grad_norm": 0.407536119222641, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0095, + "step": 4670 + }, + { + "epoch": 0.28643123814186916, + "grad_norm": 0.33687031269073486, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0091, + "step": 4680 + }, + { + "epoch": 0.28704327070200136, + "grad_norm": 0.3182448446750641, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0087, + "step": 4690 + }, + { + "epoch": 0.28765530326213357, + "grad_norm": 0.40998023748397827, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0099, + "step": 4700 + }, + { + "epoch": 0.28826733582226577, + "grad_norm": 0.28750360012054443, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0087, + "step": 4710 + }, + { + "epoch": 0.2888793683823979, + "grad_norm": 0.36494627594947815, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 0.2894914009425301, + "grad_norm": 0.37047910690307617, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0094, + "step": 4730 + }, + { + "epoch": 0.2901034335026623, + "grad_norm": 0.2577553987503052, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.2907154660627945, + "grad_norm": 0.24589397013187408, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0067, + "step": 4750 + }, + { + "epoch": 0.29132749862292673, + "grad_norm": 0.37927499413490295, + "learning_rate": 1.926404507646751e-05, + "loss": 0.008, + "step": 4760 + }, + { + "epoch": 0.29193953118305893, + "grad_norm": 0.40547946095466614, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0101, + "step": 4770 + }, + { + "epoch": 0.29255156374319113, + "grad_norm": 0.47896578907966614, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0114, + "step": 4780 + }, + { + "epoch": 0.29316359630332334, + "grad_norm": 0.42911696434020996, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0066, + "step": 4790 + }, + { + "epoch": 0.29377562886345554, + "grad_norm": 0.21735505759716034, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0072, + "step": 4800 + }, + { + "epoch": 0.29438766142358774, + "grad_norm": 0.25916650891304016, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.29499969398371995, + "grad_norm": 0.23863966763019562, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0086, + "step": 4820 + }, + { + "epoch": 0.29561172654385215, + "grad_norm": 0.41552650928497314, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0078, + "step": 4830 + }, + { + "epoch": 0.29622375910398435, + "grad_norm": 0.2775874733924866, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0082, + "step": 4840 + }, + { + "epoch": 0.29683579166411656, + "grad_norm": 0.28962916135787964, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0088, + "step": 4850 + }, + { + "epoch": 0.2974478242242487, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0076, + "step": 4860 + }, + { + "epoch": 0.2980598567843809, + "grad_norm": 0.3833489716053009, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0096, + "step": 4870 + }, + { + "epoch": 0.2986718893445131, + "grad_norm": 0.20357537269592285, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.2992839219046453, + "grad_norm": 0.4648539423942566, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0099, + "step": 4890 + }, + { + "epoch": 0.2998959544647775, + "grad_norm": 0.2701941728591919, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0062, + "step": 4900 + }, + { + "epoch": 0.3005079870249097, + "grad_norm": 0.31277161836624146, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0069, + "step": 4910 + }, + { + "epoch": 0.3011200195850419, + "grad_norm": 0.27697697281837463, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0094, + "step": 4920 + }, + { + "epoch": 0.3017320521451741, + "grad_norm": 0.22880606353282928, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0074, + "step": 4930 + }, + { + "epoch": 0.3023440847053063, + "grad_norm": 0.258404940366745, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0078, + "step": 4940 + }, + { + "epoch": 0.30295611726543853, + "grad_norm": 0.394394189119339, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0109, + "step": 4950 + }, + { + "epoch": 0.30356814982557073, + "grad_norm": 0.24108687043190002, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0082, + "step": 4960 + }, + { + "epoch": 0.30418018238570294, + "grad_norm": 0.34520867466926575, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0098, + "step": 4970 + }, + { + "epoch": 0.30479221494583514, + "grad_norm": 0.33723267912864685, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0104, + "step": 4980 + }, + { + "epoch": 0.30540424750596734, + "grad_norm": 0.28276878595352173, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0072, + "step": 4990 + }, + { + "epoch": 0.30601628006609954, + "grad_norm": 0.32236188650131226, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.012, + "step": 5000 + }, + { + "epoch": 0.3066283126262317, + "grad_norm": 0.20596888661384583, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0073, + "step": 5010 + }, + { + "epoch": 0.3072403451863639, + "grad_norm": 0.37921255826950073, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0073, + "step": 5020 + }, + { + "epoch": 0.3078523777464961, + "grad_norm": 0.30738911032676697, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0083, + "step": 5030 + }, + { + "epoch": 0.3084644103066283, + "grad_norm": 0.1938163936138153, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.3090764428667605, + "grad_norm": 0.25826898217201233, + "learning_rate": 1.914800406458133e-05, + "loss": 0.008, + "step": 5050 + }, + { + "epoch": 0.3096884754268927, + "grad_norm": 0.18951697647571564, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0058, + "step": 5060 + }, + { + "epoch": 0.3103005079870249, + "grad_norm": 0.3877381980419159, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3109125405471571, + "grad_norm": 0.3133573830127716, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0088, + "step": 5080 + }, + { + "epoch": 0.3115245731072893, + "grad_norm": 0.33131852746009827, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0062, + "step": 5090 + }, + { + "epoch": 0.3121366056674215, + "grad_norm": 0.21276263892650604, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0071, + "step": 5100 + }, + { + "epoch": 0.3127486382275537, + "grad_norm": 0.46878281235694885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0084, + "step": 5110 + }, + { + "epoch": 0.3133606707876859, + "grad_norm": 0.44227683544158936, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0097, + "step": 5120 + }, + { + "epoch": 0.3139727033478181, + "grad_norm": 0.41950204968452454, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.31458473590795033, + "grad_norm": 0.4214445948600769, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0081, + "step": 5140 + }, + { + "epoch": 0.3151967684680825, + "grad_norm": 0.3779868483543396, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0075, + "step": 5150 + }, + { + "epoch": 0.3158088010282147, + "grad_norm": 0.4587777853012085, + "learning_rate": 1.910187855634501e-05, + "loss": 0.009, + "step": 5160 + }, + { + "epoch": 0.3164208335883469, + "grad_norm": 0.4875587224960327, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0099, + "step": 5170 + }, + { + "epoch": 0.3170328661484791, + "grad_norm": 0.22378237545490265, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0071, + "step": 5180 + }, + { + "epoch": 0.3176448987086113, + "grad_norm": 0.3360678553581238, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0101, + "step": 5190 + }, + { + "epoch": 0.3182569312687435, + "grad_norm": 0.36370640993118286, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0068, + "step": 5200 + }, + { + "epoch": 0.3188689638288757, + "grad_norm": 0.25814393162727356, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0076, + "step": 5210 + }, + { + "epoch": 0.3194809963890079, + "grad_norm": 0.39010074734687805, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0066, + "step": 5220 + }, + { + "epoch": 0.3200930289491401, + "grad_norm": 0.44009074568748474, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0078, + "step": 5230 + }, + { + "epoch": 0.3207050615092723, + "grad_norm": 0.45733046531677246, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0074, + "step": 5240 + }, + { + "epoch": 0.3213170940694045, + "grad_norm": 0.4555135667324066, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0089, + "step": 5250 + }, + { + "epoch": 0.3219291266295367, + "grad_norm": 0.5864276885986328, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0083, + "step": 5260 + }, + { + "epoch": 0.3225411591896689, + "grad_norm": 0.3305470943450928, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0094, + "step": 5270 + }, + { + "epoch": 0.3231531917498011, + "grad_norm": 0.21458053588867188, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0091, + "step": 5280 + }, + { + "epoch": 0.32376522430993326, + "grad_norm": 0.2927384376525879, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.007, + "step": 5290 + }, + { + "epoch": 0.32437725687006547, + "grad_norm": 0.387608140707016, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0068, + "step": 5300 + }, + { + "epoch": 0.32498928943019767, + "grad_norm": 0.28193122148513794, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0065, + "step": 5310 + }, + { + "epoch": 0.3256013219903299, + "grad_norm": 0.33098119497299194, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0082, + "step": 5320 + }, + { + "epoch": 0.3262133545504621, + "grad_norm": 0.5442482233047485, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0124, + "step": 5330 + }, + { + "epoch": 0.3268253871105943, + "grad_norm": 0.503669798374176, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0093, + "step": 5340 + }, + { + "epoch": 0.3274374196707265, + "grad_norm": 0.2307574301958084, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0071, + "step": 5350 + }, + { + "epoch": 0.3280494522308587, + "grad_norm": 0.3543917238712311, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.007, + "step": 5360 + }, + { + "epoch": 0.3286614847909909, + "grad_norm": 0.21763169765472412, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0059, + "step": 5370 + }, + { + "epoch": 0.3292735173511231, + "grad_norm": 0.38023391366004944, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 0.3298855499112553, + "grad_norm": 0.44597327709198, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0069, + "step": 5390 + }, + { + "epoch": 0.3304975824713875, + "grad_norm": 0.2994389533996582, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0083, + "step": 5400 + }, + { + "epoch": 0.3311096150315197, + "grad_norm": 0.26668304204940796, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0071, + "step": 5410 + }, + { + "epoch": 0.3317216475916519, + "grad_norm": 0.25944197177886963, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0065, + "step": 5420 + }, + { + "epoch": 0.33233368015178405, + "grad_norm": 0.3646431267261505, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0065, + "step": 5430 + }, + { + "epoch": 0.33294571271191625, + "grad_norm": 0.34860959649086, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0098, + "step": 5440 + }, + { + "epoch": 0.33355774527204846, + "grad_norm": 0.33718568086624146, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0069, + "step": 5450 + }, + { + "epoch": 0.33416977783218066, + "grad_norm": 0.2417302280664444, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0064, + "step": 5460 + }, + { + "epoch": 0.33478181039231286, + "grad_norm": 0.26607826352119446, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0048, + "step": 5470 + }, + { + "epoch": 0.33539384295244506, + "grad_norm": 0.31762364506721497, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0064, + "step": 5480 + }, + { + "epoch": 0.33600587551257727, + "grad_norm": 0.21427015960216522, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0079, + "step": 5490 + }, + { + "epoch": 0.33661790807270947, + "grad_norm": 0.3372637629508972, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.3372299406328417, + "grad_norm": 0.3760700821876526, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0066, + "step": 5510 + }, + { + "epoch": 0.3378419731929739, + "grad_norm": 0.22838029265403748, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0061, + "step": 5520 + }, + { + "epoch": 0.3384540057531061, + "grad_norm": 0.3105243444442749, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0089, + "step": 5530 + }, + { + "epoch": 0.3390660383132383, + "grad_norm": 0.23694929480552673, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0086, + "step": 5540 + }, + { + "epoch": 0.3396780708733705, + "grad_norm": 0.22935174405574799, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.3402901034335027, + "grad_norm": 0.26384714245796204, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 0.34090213599363484, + "grad_norm": 0.33245643973350525, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0061, + "step": 5570 + }, + { + "epoch": 0.34151416855376704, + "grad_norm": 0.3904813230037689, + "learning_rate": 1.891523933768891e-05, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.34212620111389924, + "grad_norm": 0.33858415484428406, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0067, + "step": 5590 + }, + { + "epoch": 0.34273823367403145, + "grad_norm": 0.3197486996650696, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0082, + "step": 5600 + }, + { + "epoch": 0.34335026623416365, + "grad_norm": 0.23814789950847626, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0068, + "step": 5610 + }, + { + "epoch": 0.34396229879429585, + "grad_norm": 0.3820457458496094, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0082, + "step": 5620 + }, + { + "epoch": 0.34457433135442805, + "grad_norm": 0.27518680691719055, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0063, + "step": 5630 + }, + { + "epoch": 0.34518636391456026, + "grad_norm": 0.24741721153259277, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 0.34579839647469246, + "grad_norm": 0.5140052437782288, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0178, + "step": 5650 + }, + { + "epoch": 0.34641042903482466, + "grad_norm": 0.5363543033599854, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0097, + "step": 5660 + }, + { + "epoch": 0.34702246159495687, + "grad_norm": 0.41116055846214294, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0078, + "step": 5670 + }, + { + "epoch": 0.34763449415508907, + "grad_norm": 0.412762314081192, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0086, + "step": 5680 + }, + { + "epoch": 0.34824652671522127, + "grad_norm": 0.399527907371521, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0088, + "step": 5690 + }, + { + "epoch": 0.3488585592753535, + "grad_norm": 0.3447834551334381, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0084, + "step": 5700 + }, + { + "epoch": 0.3494705918354856, + "grad_norm": 0.3418859541416168, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0092, + "step": 5710 + }, + { + "epoch": 0.3500826243956178, + "grad_norm": 0.3336535692214966, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0065, + "step": 5720 + }, + { + "epoch": 0.35069465695575003, + "grad_norm": 0.34575122594833374, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0068, + "step": 5730 + }, + { + "epoch": 0.35130668951588223, + "grad_norm": 0.34325110912323, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.01, + "step": 5740 + }, + { + "epoch": 0.35191872207601443, + "grad_norm": 0.20104236900806427, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0085, + "step": 5750 + }, + { + "epoch": 0.35253075463614664, + "grad_norm": 0.33699074387550354, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0073, + "step": 5760 + }, + { + "epoch": 0.35314278719627884, + "grad_norm": 0.33322635293006897, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.35375481975641104, + "grad_norm": 0.26897475123405457, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0087, + "step": 5780 + }, + { + "epoch": 0.35436685231654325, + "grad_norm": 0.5310013890266418, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0067, + "step": 5790 + }, + { + "epoch": 0.35497888487667545, + "grad_norm": 0.4203440845012665, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0097, + "step": 5800 + }, + { + "epoch": 0.35559091743680765, + "grad_norm": 0.2179369181394577, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0082, + "step": 5810 + }, + { + "epoch": 0.35620294999693985, + "grad_norm": 0.2789444625377655, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0066, + "step": 5820 + }, + { + "epoch": 0.35681498255707206, + "grad_norm": 0.28009694814682007, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.007, + "step": 5830 + }, + { + "epoch": 0.35742701511720426, + "grad_norm": 0.304768443107605, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0065, + "step": 5840 + }, + { + "epoch": 0.3580390476773364, + "grad_norm": 0.2829401195049286, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0061, + "step": 5850 + }, + { + "epoch": 0.3586510802374686, + "grad_norm": 0.3388998508453369, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0083, + "step": 5860 + }, + { + "epoch": 0.3592631127976008, + "grad_norm": 0.3313426673412323, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 0.359875145357733, + "grad_norm": 0.2886904180049896, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0094, + "step": 5880 + }, + { + "epoch": 0.3604871779178652, + "grad_norm": 0.3132432997226715, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0086, + "step": 5890 + }, + { + "epoch": 0.3610992104779974, + "grad_norm": 0.37195107340812683, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0137, + "step": 5900 + }, + { + "epoch": 0.3617112430381296, + "grad_norm": 0.30853375792503357, + "learning_rate": 1.875708056549365e-05, + "loss": 0.01, + "step": 5910 + }, + { + "epoch": 0.36232327559826183, + "grad_norm": 0.39785459637641907, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0073, + "step": 5920 + }, + { + "epoch": 0.36293530815839403, + "grad_norm": 0.26958727836608887, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0059, + "step": 5930 + }, + { + "epoch": 0.36354734071852624, + "grad_norm": 0.354956716299057, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0069, + "step": 5940 + }, + { + "epoch": 0.36415937327865844, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0066, + "step": 5950 + }, + { + "epoch": 0.36477140583879064, + "grad_norm": 0.30000701546669006, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0075, + "step": 5960 + }, + { + "epoch": 0.36538343839892284, + "grad_norm": 0.5558263063430786, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0083, + "step": 5970 + }, + { + "epoch": 0.36599547095905505, + "grad_norm": 0.39146295189857483, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0062, + "step": 5980 + }, + { + "epoch": 0.3666075035191872, + "grad_norm": 0.44002753496170044, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0089, + "step": 5990 + }, + { + "epoch": 0.3672195360793194, + "grad_norm": 0.3220095932483673, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0097, + "step": 6000 + }, + { + "epoch": 0.3678315686394516, + "grad_norm": 0.3569507598876953, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0067, + "step": 6010 + }, + { + "epoch": 0.3684436011995838, + "grad_norm": 0.3004184365272522, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0084, + "step": 6020 + }, + { + "epoch": 0.369055633759716, + "grad_norm": 0.2931320071220398, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0079, + "step": 6030 + }, + { + "epoch": 0.3696676663198482, + "grad_norm": 0.39551016688346863, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0088, + "step": 6040 + }, + { + "epoch": 0.3702796988799804, + "grad_norm": 0.33755603432655334, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0082, + "step": 6050 + }, + { + "epoch": 0.3708917314401126, + "grad_norm": 0.3101558983325958, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0069, + "step": 6060 + }, + { + "epoch": 0.3715037640002448, + "grad_norm": 0.2921602129936218, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0065, + "step": 6070 + }, + { + "epoch": 0.372115796560377, + "grad_norm": 0.3601403832435608, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0063, + "step": 6080 + }, + { + "epoch": 0.3727278291205092, + "grad_norm": 0.34929168224334717, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0073, + "step": 6090 + }, + { + "epoch": 0.3733398616806414, + "grad_norm": 0.3987390995025635, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0068, + "step": 6100 + }, + { + "epoch": 0.37395189424077363, + "grad_norm": 0.2641090452671051, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0091, + "step": 6110 + }, + { + "epoch": 0.37456392680090583, + "grad_norm": 0.23139338195323944, + "learning_rate": 1.865125972978549e-05, + "loss": 0.006, + "step": 6120 + }, + { + "epoch": 0.375175959361038, + "grad_norm": 0.26552167534828186, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.3757879919211702, + "grad_norm": 0.43827885389328003, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0084, + "step": 6140 + }, + { + "epoch": 0.3764000244813024, + "grad_norm": 0.27495354413986206, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 0.3770120570414346, + "grad_norm": 0.36078640818595886, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0104, + "step": 6160 + }, + { + "epoch": 0.3776240896015668, + "grad_norm": 0.28252753615379333, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0103, + "step": 6170 + }, + { + "epoch": 0.378236122161699, + "grad_norm": 0.2674558162689209, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0069, + "step": 6180 + }, + { + "epoch": 0.3788481547218312, + "grad_norm": 0.21457509696483612, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.3794601872819634, + "grad_norm": 0.3142339885234833, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0075, + "step": 6200 + }, + { + "epoch": 0.3800722198420956, + "grad_norm": 0.32714203000068665, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0084, + "step": 6210 + }, + { + "epoch": 0.3806842524022278, + "grad_norm": 0.2632557153701782, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0072, + "step": 6220 + }, + { + "epoch": 0.38129628496236, + "grad_norm": 0.1893932968378067, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0063, + "step": 6230 + }, + { + "epoch": 0.3819083175224922, + "grad_norm": 0.49935290217399597, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0087, + "step": 6240 + }, + { + "epoch": 0.3825203500826244, + "grad_norm": 0.34605127573013306, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0101, + "step": 6250 + }, + { + "epoch": 0.3831323826427566, + "grad_norm": 0.3294198513031006, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0067, + "step": 6260 + }, + { + "epoch": 0.38374441520288877, + "grad_norm": 0.34797370433807373, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0065, + "step": 6270 + }, + { + "epoch": 0.38435644776302097, + "grad_norm": 0.37710750102996826, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0061, + "step": 6280 + }, + { + "epoch": 0.3849684803231532, + "grad_norm": 0.39949893951416016, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0066, + "step": 6290 + }, + { + "epoch": 0.3855805128832854, + "grad_norm": 0.33014294505119324, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0095, + "step": 6300 + }, + { + "epoch": 0.3861925454434176, + "grad_norm": 0.4329249858856201, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0089, + "step": 6310 + }, + { + "epoch": 0.3868045780035498, + "grad_norm": 0.298330157995224, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0076, + "step": 6320 + }, + { + "epoch": 0.387416610563682, + "grad_norm": 0.2672661542892456, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 0.3880286431238142, + "grad_norm": 0.48193076252937317, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0103, + "step": 6340 + }, + { + "epoch": 0.3886406756839464, + "grad_norm": 0.29180601239204407, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0075, + "step": 6350 + }, + { + "epoch": 0.3892527082440786, + "grad_norm": 0.21320492029190063, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0077, + "step": 6360 + }, + { + "epoch": 0.3898647408042108, + "grad_norm": 0.37252935767173767, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0078, + "step": 6370 + }, + { + "epoch": 0.390476773364343, + "grad_norm": 0.284586101770401, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0089, + "step": 6380 + }, + { + "epoch": 0.3910888059244752, + "grad_norm": 0.5030382871627808, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0088, + "step": 6390 + }, + { + "epoch": 0.3917008384846074, + "grad_norm": 0.357239305973053, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0069, + "step": 6400 + }, + { + "epoch": 0.39231287104473955, + "grad_norm": 0.20308594405651093, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0062, + "step": 6410 + }, + { + "epoch": 0.39292490360487176, + "grad_norm": 0.2678150534629822, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.39353693616500396, + "grad_norm": 0.35160595178604126, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0072, + "step": 6430 + }, + { + "epoch": 0.39414896872513616, + "grad_norm": 0.33254173398017883, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 0.39476100128526836, + "grad_norm": 0.22763408720493317, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0061, + "step": 6450 + }, + { + "epoch": 0.39537303384540057, + "grad_norm": 0.20889192819595337, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 0.39598506640553277, + "grad_norm": 0.22515206038951874, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0086, + "step": 6470 + }, + { + "epoch": 0.396597098965665, + "grad_norm": 0.36421817541122437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0064, + "step": 6480 + }, + { + "epoch": 0.3972091315257972, + "grad_norm": 0.3869773745536804, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0084, + "step": 6490 + }, + { + "epoch": 0.3978211640859294, + "grad_norm": 0.26248687505722046, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0061, + "step": 6500 + }, + { + "epoch": 0.3984331966460616, + "grad_norm": 0.22152310609817505, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0057, + "step": 6510 + }, + { + "epoch": 0.3990452292061938, + "grad_norm": 0.25921961665153503, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0071, + "step": 6520 + }, + { + "epoch": 0.399657261766326, + "grad_norm": 0.3289903998374939, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.4002692943264582, + "grad_norm": 0.2767571210861206, + "learning_rate": 1.8427795928237e-05, + "loss": 0.01, + "step": 6540 + }, + { + "epoch": 0.40088132688659034, + "grad_norm": 0.46339666843414307, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0064, + "step": 6550 + }, + { + "epoch": 0.40149335944672254, + "grad_norm": 0.2942553460597992, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0067, + "step": 6560 + }, + { + "epoch": 0.40210539200685474, + "grad_norm": 0.3868240714073181, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0085, + "step": 6570 + }, + { + "epoch": 0.40271742456698695, + "grad_norm": 0.3999684154987335, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0067, + "step": 6580 + }, + { + "epoch": 0.40332945712711915, + "grad_norm": 0.42856812477111816, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0091, + "step": 6590 + }, + { + "epoch": 0.40394148968725135, + "grad_norm": 0.3099806010723114, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0084, + "step": 6600 + }, + { + "epoch": 0.40455352224738356, + "grad_norm": 0.3798827826976776, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0066, + "step": 6610 + }, + { + "epoch": 0.40516555480751576, + "grad_norm": 0.19007280468940735, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 0.40577758736764796, + "grad_norm": 0.3723277151584625, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0085, + "step": 6630 + }, + { + "epoch": 0.40638961992778017, + "grad_norm": 0.21034900844097137, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0069, + "step": 6640 + }, + { + "epoch": 0.40700165248791237, + "grad_norm": 0.29838645458221436, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0095, + "step": 6650 + }, + { + "epoch": 0.40761368504804457, + "grad_norm": 0.2645854353904724, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0061, + "step": 6660 + }, + { + "epoch": 0.4082257176081768, + "grad_norm": 0.21633592247962952, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.006, + "step": 6670 + }, + { + "epoch": 0.408837750168309, + "grad_norm": 0.25387731194496155, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.008, + "step": 6680 + }, + { + "epoch": 0.4094497827284412, + "grad_norm": 0.3752288520336151, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0092, + "step": 6690 + }, + { + "epoch": 0.41006181528857333, + "grad_norm": 0.33368971943855286, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0072, + "step": 6700 + }, + { + "epoch": 0.41067384784870553, + "grad_norm": 0.34388917684555054, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0074, + "step": 6710 + }, + { + "epoch": 0.41128588040883773, + "grad_norm": 0.2683192789554596, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.007, + "step": 6720 + }, + { + "epoch": 0.41189791296896994, + "grad_norm": 0.5121234059333801, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0074, + "step": 6730 + }, + { + "epoch": 0.41250994552910214, + "grad_norm": 0.333406925201416, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0117, + "step": 6740 + }, + { + "epoch": 0.41312197808923434, + "grad_norm": 0.26011794805526733, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0062, + "step": 6750 + }, + { + "epoch": 0.41373401064936655, + "grad_norm": 0.28925821185112, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0066, + "step": 6760 + }, + { + "epoch": 0.41434604320949875, + "grad_norm": 0.2202957570552826, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0058, + "step": 6770 + }, + { + "epoch": 0.41495807576963095, + "grad_norm": 0.2740793824195862, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0072, + "step": 6780 + }, + { + "epoch": 0.41557010832976315, + "grad_norm": 0.46569427847862244, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0069, + "step": 6790 + }, + { + "epoch": 0.41618214088989536, + "grad_norm": 0.3959881067276001, + "learning_rate": 1.828172598376902e-05, + "loss": 0.009, + "step": 6800 + }, + { + "epoch": 0.41679417345002756, + "grad_norm": 0.2465214729309082, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0068, + "step": 6810 + }, + { + "epoch": 0.41740620601015976, + "grad_norm": 0.3207756280899048, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0083, + "step": 6820 + }, + { + "epoch": 0.41801823857029197, + "grad_norm": 0.5600990653038025, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.4186302711304241, + "grad_norm": 0.32832831144332886, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0072, + "step": 6840 + }, + { + "epoch": 0.4192423036905563, + "grad_norm": 0.3397129774093628, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0083, + "step": 6850 + }, + { + "epoch": 0.4198543362506885, + "grad_norm": 0.3481312096118927, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.4204663688108207, + "grad_norm": 0.4542059898376465, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0104, + "step": 6870 + }, + { + "epoch": 0.4210784013709529, + "grad_norm": 0.2517620325088501, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0089, + "step": 6880 + }, + { + "epoch": 0.42169043393108513, + "grad_norm": 0.3671923875808716, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0068, + "step": 6890 + }, + { + "epoch": 0.42230246649121733, + "grad_norm": 0.41340726613998413, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0084, + "step": 6900 + }, + { + "epoch": 0.42291449905134954, + "grad_norm": 0.22815965116024017, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 0.42352653161148174, + "grad_norm": 0.35324010252952576, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0073, + "step": 6920 + }, + { + "epoch": 0.42413856417161394, + "grad_norm": 0.30134323239326477, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0071, + "step": 6930 + }, + { + "epoch": 0.42475059673174614, + "grad_norm": 0.4007415771484375, + "learning_rate": 1.82006727813775e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 0.42536262929187835, + "grad_norm": 0.3320179879665375, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0074, + "step": 6950 + }, + { + "epoch": 0.42597466185201055, + "grad_norm": 0.311971515417099, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0062, + "step": 6960 + }, + { + "epoch": 0.42658669441214275, + "grad_norm": 0.34347453713417053, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 0.4271987269722749, + "grad_norm": 0.25632336735725403, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.4278107595324071, + "grad_norm": 0.21711130440235138, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0068, + "step": 6990 + }, + { + "epoch": 0.4284227920925393, + "grad_norm": 0.3381270170211792, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0064, + "step": 7000 + }, + { + "epoch": 0.4290348246526715, + "grad_norm": 0.32262885570526123, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0091, + "step": 7010 + }, + { + "epoch": 0.4296468572128037, + "grad_norm": 0.65865558385849, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0104, + "step": 7020 + }, + { + "epoch": 0.4302588897729359, + "grad_norm": 0.3021128177642822, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.009, + "step": 7030 + }, + { + "epoch": 0.4308709223330681, + "grad_norm": 0.2859005331993103, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0065, + "step": 7040 + }, + { + "epoch": 0.4314829548932003, + "grad_norm": 0.3379405736923218, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0062, + "step": 7050 + }, + { + "epoch": 0.4320949874533325, + "grad_norm": 0.22009991109371185, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.007, + "step": 7060 + }, + { + "epoch": 0.4327070200134647, + "grad_norm": 0.24766206741333008, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0072, + "step": 7070 + }, + { + "epoch": 0.43331905257359693, + "grad_norm": 0.3557615280151367, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0096, + "step": 7080 + }, + { + "epoch": 0.43393108513372913, + "grad_norm": 0.5700691938400269, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0066, + "step": 7090 + }, + { + "epoch": 0.43454311769386134, + "grad_norm": 0.3194892704486847, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0065, + "step": 7100 + }, + { + "epoch": 0.43515515025399354, + "grad_norm": 0.2766750752925873, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0074, + "step": 7110 + }, + { + "epoch": 0.4357671828141257, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.809403050791396e-05, + "loss": 0.007, + "step": 7120 + }, + { + "epoch": 0.4363792153742579, + "grad_norm": 0.4468507170677185, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0066, + "step": 7130 + }, + { + "epoch": 0.4369912479343901, + "grad_norm": 0.3282400369644165, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0185, + "step": 7140 + }, + { + "epoch": 0.4376032804945223, + "grad_norm": 0.2625710964202881, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0088, + "step": 7150 + }, + { + "epoch": 0.4382153130546545, + "grad_norm": 0.47729599475860596, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.008, + "step": 7160 + }, + { + "epoch": 0.4388273456147867, + "grad_norm": 0.30350950360298157, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0091, + "step": 7170 + }, + { + "epoch": 0.4394393781749189, + "grad_norm": 0.3514627516269684, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0065, + "step": 7180 + }, + { + "epoch": 0.4400514107350511, + "grad_norm": 0.26150578260421753, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0087, + "step": 7190 + }, + { + "epoch": 0.4406634432951833, + "grad_norm": 0.374138206243515, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0073, + "step": 7200 + }, + { + "epoch": 0.4412754758553155, + "grad_norm": 0.2980635166168213, + "learning_rate": 1.803969531201634e-05, + "loss": 0.007, + "step": 7210 + }, + { + "epoch": 0.4418875084154477, + "grad_norm": 0.38190510869026184, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0077, + "step": 7220 + }, + { + "epoch": 0.4424995409755799, + "grad_norm": 0.28819066286087036, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0065, + "step": 7230 + }, + { + "epoch": 0.4431115735357121, + "grad_norm": 0.43382275104522705, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0077, + "step": 7240 + }, + { + "epoch": 0.4437236060958443, + "grad_norm": 0.31589648127555847, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0065, + "step": 7250 + }, + { + "epoch": 0.4443356386559765, + "grad_norm": 0.3744536340236664, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0067, + "step": 7260 + }, + { + "epoch": 0.4449476712161087, + "grad_norm": 0.2600225806236267, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.008, + "step": 7270 + }, + { + "epoch": 0.4455597037762409, + "grad_norm": 0.28064799308776855, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0061, + "step": 7280 + }, + { + "epoch": 0.4461717363363731, + "grad_norm": 0.2745135426521301, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0075, + "step": 7290 + }, + { + "epoch": 0.4467837688965053, + "grad_norm": 0.23609793186187744, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 0.4473958014566375, + "grad_norm": 0.35910022258758545, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0079, + "step": 7310 + }, + { + "epoch": 0.4480078340167697, + "grad_norm": 0.22230662405490875, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0065, + "step": 7320 + }, + { + "epoch": 0.4486198665769019, + "grad_norm": 0.3835199475288391, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.008, + "step": 7330 + }, + { + "epoch": 0.4492318991370341, + "grad_norm": 0.37863102555274963, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0083, + "step": 7340 + }, + { + "epoch": 0.4498439316971663, + "grad_norm": 0.25412216782569885, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0056, + "step": 7350 + }, + { + "epoch": 0.4504559642572985, + "grad_norm": 0.43248918652534485, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0061, + "step": 7360 + }, + { + "epoch": 0.4510679968174307, + "grad_norm": 0.2937811613082886, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0073, + "step": 7370 + }, + { + "epoch": 0.4516800293775629, + "grad_norm": 0.3018436133861542, + "learning_rate": 1.793524061803872e-05, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.4522920619376951, + "grad_norm": 0.32781726121902466, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0079, + "step": 7390 + }, + { + "epoch": 0.45290409449782726, + "grad_norm": 0.2843719720840454, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0067, + "step": 7400 + }, + { + "epoch": 0.45351612705795946, + "grad_norm": 0.27588292956352234, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0092, + "step": 7410 + }, + { + "epoch": 0.45412815961809166, + "grad_norm": 0.38858234882354736, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0067, + "step": 7420 + }, + { + "epoch": 0.45474019217822387, + "grad_norm": 0.4235166609287262, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 0.45535222473835607, + "grad_norm": 0.272210031747818, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0059, + "step": 7440 + }, + { + "epoch": 0.4559642572984883, + "grad_norm": 0.23851896822452545, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0071, + "step": 7450 + }, + { + "epoch": 0.4565762898586205, + "grad_norm": 0.37179476022720337, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0073, + "step": 7460 + }, + { + "epoch": 0.4571883224187527, + "grad_norm": 0.31902605295181274, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.009, + "step": 7470 + }, + { + "epoch": 0.4578003549788849, + "grad_norm": 0.47023633122444153, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0091, + "step": 7480 + }, + { + "epoch": 0.4584123875390171, + "grad_norm": 0.35726839303970337, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0079, + "step": 7490 + }, + { + "epoch": 0.4590244200991493, + "grad_norm": 0.27567291259765625, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0073, + "step": 7500 + }, + { + "epoch": 0.4596364526592815, + "grad_norm": 0.23053516447544098, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0065, + "step": 7510 + }, + { + "epoch": 0.4602484852194137, + "grad_norm": 0.2169056385755539, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0054, + "step": 7520 + }, + { + "epoch": 0.4608605177795459, + "grad_norm": 0.2912258207798004, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.46147255033967804, + "grad_norm": 0.2527846097946167, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.006, + "step": 7540 + }, + { + "epoch": 0.46208458289981025, + "grad_norm": 0.3878445029258728, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 0.46269661545994245, + "grad_norm": 0.3981980085372925, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0081, + "step": 7560 + }, + { + "epoch": 0.46330864802007465, + "grad_norm": 0.48834845423698425, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "epoch": 0.46392068058020686, + "grad_norm": 0.3045276701450348, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0085, + "step": 7580 + }, + { + "epoch": 0.46453271314033906, + "grad_norm": 0.23345299065113068, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0072, + "step": 7590 + }, + { + "epoch": 0.46514474570047126, + "grad_norm": 0.3632943034172058, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 0.46575677826060347, + "grad_norm": 0.19813670217990875, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0073, + "step": 7610 + }, + { + "epoch": 0.46636881082073567, + "grad_norm": 0.36094173789024353, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0085, + "step": 7620 + }, + { + "epoch": 0.46698084338086787, + "grad_norm": 0.30049464106559753, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0085, + "step": 7630 + }, + { + "epoch": 0.4675928759410001, + "grad_norm": 0.27693697810173035, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.4682049085011323, + "grad_norm": 0.3656866252422333, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0064, + "step": 7650 + }, + { + "epoch": 0.4688169410612645, + "grad_norm": 0.602168083190918, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 0.4694289736213967, + "grad_norm": 0.3553078770637512, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0062, + "step": 7670 + }, + { + "epoch": 0.47004100618152883, + "grad_norm": 0.326695054769516, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0062, + "step": 7680 + }, + { + "epoch": 0.47065303874166103, + "grad_norm": 0.2762170732021332, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0075, + "step": 7690 + }, + { + "epoch": 0.47126507130179324, + "grad_norm": 0.35057321190834045, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0063, + "step": 7700 + }, + { + "epoch": 0.47187710386192544, + "grad_norm": 0.3906462788581848, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0086, + "step": 7710 + }, + { + "epoch": 0.47248913642205764, + "grad_norm": 0.290752112865448, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0087, + "step": 7720 + }, + { + "epoch": 0.47310116898218985, + "grad_norm": 0.2242034673690796, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.47371320154232205, + "grad_norm": 0.3283435106277466, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0068, + "step": 7740 + }, + { + "epoch": 0.47432523410245425, + "grad_norm": 0.24059069156646729, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0056, + "step": 7750 + }, + { + "epoch": 0.47493726666258645, + "grad_norm": 0.2978667914867401, + "learning_rate": 1.769330275540774e-05, + "loss": 0.007, + "step": 7760 + }, + { + "epoch": 0.47554929922271866, + "grad_norm": 0.2605571150779724, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0084, + "step": 7770 + }, + { + "epoch": 0.47616133178285086, + "grad_norm": 0.4010445475578308, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0075, + "step": 7780 + }, + { + "epoch": 0.47677336434298306, + "grad_norm": 0.31932029128074646, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0076, + "step": 7790 + }, + { + "epoch": 0.47738539690311527, + "grad_norm": 0.3508684039115906, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0067, + "step": 7800 + }, + { + "epoch": 0.47799742946324747, + "grad_norm": 0.2835206091403961, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0064, + "step": 7810 + }, + { + "epoch": 0.4786094620233796, + "grad_norm": 0.2661663293838501, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0051, + "step": 7820 + }, + { + "epoch": 0.4792214945835118, + "grad_norm": 0.4146379828453064, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.479833527143644, + "grad_norm": 0.38621196150779724, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0076, + "step": 7840 + }, + { + "epoch": 0.4804455597037762, + "grad_norm": 0.19052188098430634, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.008, + "step": 7850 + }, + { + "epoch": 0.48105759226390843, + "grad_norm": 0.3699149489402771, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0071, + "step": 7860 + }, + { + "epoch": 0.48166962482404063, + "grad_norm": 0.3756427764892578, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0071, + "step": 7870 + }, + { + "epoch": 0.48228165738417283, + "grad_norm": 0.2987386882305145, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0069, + "step": 7880 + }, + { + "epoch": 0.48289368994430504, + "grad_norm": 0.24891899526119232, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.48350572250443724, + "grad_norm": 0.44080299139022827, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.011, + "step": 7900 + }, + { + "epoch": 0.48411775506456944, + "grad_norm": 0.20801177620887756, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0089, + "step": 7910 + }, + { + "epoch": 0.48472978762470165, + "grad_norm": 0.31475305557250977, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0091, + "step": 7920 + }, + { + "epoch": 0.48534182018483385, + "grad_norm": 0.29783639311790466, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0082, + "step": 7930 + }, + { + "epoch": 0.48595385274496605, + "grad_norm": 0.3330203890800476, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0071, + "step": 7940 + }, + { + "epoch": 0.48656588530509826, + "grad_norm": 0.3537667691707611, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0068, + "step": 7950 + }, + { + "epoch": 0.4871779178652304, + "grad_norm": 0.2810688316822052, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0059, + "step": 7960 + }, + { + "epoch": 0.4877899504253626, + "grad_norm": 0.3359779715538025, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.4884019829854948, + "grad_norm": 0.36015257239341736, + "learning_rate": 1.754802282200567e-05, + "loss": 0.008, + "step": 7980 + }, + { + "epoch": 0.489014015545627, + "grad_norm": 0.2647690176963806, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0065, + "step": 7990 + }, + { + "epoch": 0.4896260481057592, + "grad_norm": 0.23366811871528625, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0068, + "step": 8000 + }, + { + "epoch": 0.4902380806658914, + "grad_norm": 0.2904139757156372, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0054, + "step": 8010 + }, + { + "epoch": 0.4908501132260236, + "grad_norm": 0.30941230058670044, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0069, + "step": 8020 + }, + { + "epoch": 0.4914621457861558, + "grad_norm": 0.1959473341703415, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0063, + "step": 8030 + }, + { + "epoch": 0.492074178346288, + "grad_norm": 0.33349713683128357, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0059, + "step": 8040 + }, + { + "epoch": 0.49268621090642023, + "grad_norm": 0.39017921686172485, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0067, + "step": 8050 + }, + { + "epoch": 0.49329824346655243, + "grad_norm": 0.36401957273483276, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0061, + "step": 8060 + }, + { + "epoch": 0.49391027602668464, + "grad_norm": 0.22296921908855438, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0071, + "step": 8070 + }, + { + "epoch": 0.49452230858681684, + "grad_norm": 0.8712129592895508, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0104, + "step": 8080 + }, + { + "epoch": 0.49513434114694904, + "grad_norm": 0.39942649006843567, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0068, + "step": 8090 + }, + { + "epoch": 0.4957463737070812, + "grad_norm": 0.3821292817592621, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0076, + "step": 8100 + }, + { + "epoch": 0.4963584062672134, + "grad_norm": 0.35861077904701233, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0086, + "step": 8110 + }, + { + "epoch": 0.4969704388273456, + "grad_norm": 0.38629451394081116, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0068, + "step": 8120 + }, + { + "epoch": 0.4975824713874778, + "grad_norm": 3.412374973297119, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0168, + "step": 8130 + }, + { + "epoch": 0.49819450394761, + "grad_norm": 0.2893833816051483, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0065, + "step": 8140 + }, + { + "epoch": 0.4988065365077422, + "grad_norm": 0.37679117918014526, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0058, + "step": 8150 + }, + { + "epoch": 0.4994185690678744, + "grad_norm": 0.2745130658149719, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0082, + "step": 8160 + }, + { + "epoch": 0.5000306016280066, + "grad_norm": 0.30250442028045654, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0065, + "step": 8170 + }, + { + "epoch": 0.5006426341881388, + "grad_norm": 0.19602464139461517, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0056, + "step": 8180 + }, + { + "epoch": 0.501254666748271, + "grad_norm": 0.4736115634441376, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0062, + "step": 8190 + }, + { + "epoch": 0.5018666993084032, + "grad_norm": 0.25439244508743286, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0069, + "step": 8200 + }, + { + "epoch": 0.5024787318685354, + "grad_norm": 0.19290995597839355, + "learning_rate": 1.739216409306913e-05, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.5030907644286676, + "grad_norm": 0.24844267964363098, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0071, + "step": 8220 + }, + { + "epoch": 0.5037027969887998, + "grad_norm": 0.21179668605327606, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 0.504314829548932, + "grad_norm": 0.29139387607574463, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.5049268621090642, + "grad_norm": 0.2621973752975464, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0068, + "step": 8250 + }, + { + "epoch": 0.5055388946691964, + "grad_norm": 0.23394125699996948, + "learning_rate": 1.735775329110705e-05, + "loss": 0.006, + "step": 8260 + }, + { + "epoch": 0.5061509272293286, + "grad_norm": 0.28399863839149475, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0067, + "step": 8270 + }, + { + "epoch": 0.5067629597894608, + "grad_norm": 0.5048072934150696, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.008, + "step": 8280 + }, + { + "epoch": 0.507374992349593, + "grad_norm": 0.33848801255226135, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0054, + "step": 8290 + }, + { + "epoch": 0.5079870249097252, + "grad_norm": 0.28341951966285706, + "learning_rate": 1.733009030001197e-05, + "loss": 0.008, + "step": 8300 + }, + { + "epoch": 0.5085990574698575, + "grad_norm": 0.3223153054714203, + "learning_rate": 1.732315596014244e-05, + "loss": 0.007, + "step": 8310 + }, + { + "epoch": 0.5092110900299895, + "grad_norm": 0.23227599263191223, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.5098231225901217, + "grad_norm": 0.2847786247730255, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.510435155150254, + "grad_norm": 0.2026357650756836, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 0.5110471877103862, + "grad_norm": 0.3617453873157501, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 0.5116592202705184, + "grad_norm": 0.4439109265804291, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0067, + "step": 8360 + }, + { + "epoch": 0.5122712528306506, + "grad_norm": 0.26640209555625916, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0086, + "step": 8370 + }, + { + "epoch": 0.5128832853907828, + "grad_norm": 0.38045984506607056, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.513495317950915, + "grad_norm": 0.23035791516304016, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.006, + "step": 8390 + }, + { + "epoch": 0.5141073505110472, + "grad_norm": 0.40618664026260376, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0071, + "step": 8400 + }, + { + "epoch": 0.5147193830711794, + "grad_norm": 0.2593354880809784, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0064, + "step": 8410 + }, + { + "epoch": 0.5153314156313116, + "grad_norm": 0.27723655104637146, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0062, + "step": 8420 + }, + { + "epoch": 0.5159434481914438, + "grad_norm": 0.3793911039829254, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0059, + "step": 8430 + }, + { + "epoch": 0.516555480751576, + "grad_norm": 0.28634312748908997, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0063, + "step": 8440 + }, + { + "epoch": 0.5171675133117082, + "grad_norm": 0.39417290687561035, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0078, + "step": 8450 + }, + { + "epoch": 0.5177795458718404, + "grad_norm": 0.3043057322502136, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0064, + "step": 8460 + }, + { + "epoch": 0.5183915784319726, + "grad_norm": 0.36794111132621765, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0106, + "step": 8470 + }, + { + "epoch": 0.5190036109921048, + "grad_norm": 0.312161922454834, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0067, + "step": 8480 + }, + { + "epoch": 0.519615643552237, + "grad_norm": 0.39240267872810364, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0065, + "step": 8490 + }, + { + "epoch": 0.5202276761123692, + "grad_norm": 0.4500446915626526, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0073, + "step": 8500 + }, + { + "epoch": 0.5208397086725014, + "grad_norm": 0.22808927297592163, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0072, + "step": 8510 + }, + { + "epoch": 0.5214517412326336, + "grad_norm": 0.3262411057949066, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0065, + "step": 8520 + }, + { + "epoch": 0.5220637737927658, + "grad_norm": 0.472229927778244, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0068, + "step": 8530 + }, + { + "epoch": 0.522675806352898, + "grad_norm": 0.31563568115234375, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5232878389130302, + "grad_norm": 0.27949750423431396, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0064, + "step": 8550 + }, + { + "epoch": 0.5238998714731624, + "grad_norm": 0.30297499895095825, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0075, + "step": 8560 + }, + { + "epoch": 0.5245119040332946, + "grad_norm": 0.3946770429611206, + "learning_rate": 1.714028248198457e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 0.5251239365934268, + "grad_norm": 0.3405992090702057, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0082, + "step": 8580 + }, + { + "epoch": 0.525735969153559, + "grad_norm": 0.2963511347770691, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0066, + "step": 8590 + }, + { + "epoch": 0.5263480017136911, + "grad_norm": 0.1909177303314209, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.006, + "step": 8600 + }, + { + "epoch": 0.5269600342738233, + "grad_norm": 0.3378836512565613, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 0.5275720668339555, + "grad_norm": 0.30862805247306824, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0067, + "step": 8620 + }, + { + "epoch": 0.5281840993940877, + "grad_norm": 0.397293359041214, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0071, + "step": 8630 + }, + { + "epoch": 0.5287961319542199, + "grad_norm": 0.3665411174297333, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0068, + "step": 8640 + }, + { + "epoch": 0.5294081645143521, + "grad_norm": 0.34842419624328613, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.5300201970744843, + "grad_norm": 0.38205671310424805, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0065, + "step": 8660 + }, + { + "epoch": 0.5306322296346165, + "grad_norm": 0.35549092292785645, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0068, + "step": 8670 + }, + { + "epoch": 0.5312442621947487, + "grad_norm": 0.15676020085811615, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0055, + "step": 8680 + }, + { + "epoch": 0.5318562947548809, + "grad_norm": 0.22985056042671204, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0071, + "step": 8690 + }, + { + "epoch": 0.5324683273150131, + "grad_norm": 0.2743426263332367, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0047, + "step": 8700 + }, + { + "epoch": 0.5330803598751453, + "grad_norm": 0.2503803074359894, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0079, + "step": 8710 + }, + { + "epoch": 0.5336923924352776, + "grad_norm": 0.5036469101905823, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5343044249954098, + "grad_norm": 0.2349964827299118, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0068, + "step": 8730 + }, + { + "epoch": 0.534916457555542, + "grad_norm": 0.28706061840057373, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0065, + "step": 8740 + }, + { + "epoch": 0.5355284901156742, + "grad_norm": 0.21812452375888824, + "learning_rate": 1.701081551967764e-05, + "loss": 0.008, + "step": 8750 + }, + { + "epoch": 0.5361405226758064, + "grad_norm": 0.301618754863739, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 0.5367525552359386, + "grad_norm": 0.35402950644493103, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0067, + "step": 8770 + }, + { + "epoch": 0.5373645877960708, + "grad_norm": 0.2875203788280487, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0097, + "step": 8780 + }, + { + "epoch": 0.537976620356203, + "grad_norm": 0.2358965128660202, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0053, + "step": 8790 + }, + { + "epoch": 0.5385886529163352, + "grad_norm": 0.14462094008922577, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0059, + "step": 8800 + }, + { + "epoch": 0.5392006854764674, + "grad_norm": 0.17893171310424805, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0062, + "step": 8810 + }, + { + "epoch": 0.5398127180365996, + "grad_norm": 0.2923351526260376, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0078, + "step": 8820 + }, + { + "epoch": 0.5404247505967318, + "grad_norm": 0.3288479745388031, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0067, + "step": 8830 + }, + { + "epoch": 0.541036783156864, + "grad_norm": 0.3996310532093048, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5416488157169962, + "grad_norm": 0.24345380067825317, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0067, + "step": 8850 + }, + { + "epoch": 0.5422608482771284, + "grad_norm": 0.26688340306282043, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0072, + "step": 8860 + }, + { + "epoch": 0.5428728808372606, + "grad_norm": 0.4816153645515442, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0099, + "step": 8870 + }, + { + "epoch": 0.5434849133973927, + "grad_norm": 0.22544988989830017, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.007, + "step": 8880 + }, + { + "epoch": 0.5440969459575249, + "grad_norm": 0.2820419669151306, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0064, + "step": 8890 + }, + { + "epoch": 0.5447089785176571, + "grad_norm": 0.2758846879005432, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0072, + "step": 8900 + }, + { + "epoch": 0.5453210110777893, + "grad_norm": 0.4620129466056824, + "learning_rate": 1.689381359053773e-05, + "loss": 0.008, + "step": 8910 + }, + { + "epoch": 0.5459330436379215, + "grad_norm": 0.5567039847373962, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0079, + "step": 8920 + }, + { + "epoch": 0.5465450761980537, + "grad_norm": 0.347251832485199, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.006, + "step": 8930 + }, + { + "epoch": 0.5471571087581859, + "grad_norm": 0.31768012046813965, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0065, + "step": 8940 + }, + { + "epoch": 0.5477691413183181, + "grad_norm": 0.24245156347751617, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0052, + "step": 8950 + }, + { + "epoch": 0.5483811738784503, + "grad_norm": 0.2124931961297989, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0074, + "step": 8960 + }, + { + "epoch": 0.5489932064385825, + "grad_norm": 0.18998636305332184, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0056, + "step": 8970 + }, + { + "epoch": 0.5496052389987147, + "grad_norm": 0.2667362689971924, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0056, + "step": 8980 + }, + { + "epoch": 0.5502172715588469, + "grad_norm": 0.4424617886543274, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0091, + "step": 8990 + }, + { + "epoch": 0.5508293041189791, + "grad_norm": 0.33623644709587097, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0061, + "step": 9000 + }, + { + "epoch": 0.5514413366791113, + "grad_norm": 0.29990604519844055, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0059, + "step": 9010 + }, + { + "epoch": 0.5520533692392435, + "grad_norm": 0.4384118914604187, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.5526654017993757, + "grad_norm": 0.3468496799468994, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0068, + "step": 9030 + }, + { + "epoch": 0.5532774343595079, + "grad_norm": 0.3473573327064514, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0062, + "step": 9040 + }, + { + "epoch": 0.5538894669196401, + "grad_norm": 0.36125242710113525, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0062, + "step": 9050 + }, + { + "epoch": 0.5545014994797723, + "grad_norm": 0.2603420615196228, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0091, + "step": 9060 + }, + { + "epoch": 0.5551135320399045, + "grad_norm": 0.27355659008026123, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0074, + "step": 9070 + }, + { + "epoch": 0.5557255646000367, + "grad_norm": 0.24741119146347046, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0064, + "step": 9080 + }, + { + "epoch": 0.556337597160169, + "grad_norm": 0.2001475840806961, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0094, + "step": 9090 + }, + { + "epoch": 0.5569496297203012, + "grad_norm": 0.41522347927093506, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0088, + "step": 9100 + }, + { + "epoch": 0.5575616622804334, + "grad_norm": 0.27282488346099854, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0062, + "step": 9110 + }, + { + "epoch": 0.5581736948405656, + "grad_norm": 0.26905956864356995, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.007, + "step": 9120 + }, + { + "epoch": 0.5587857274006978, + "grad_norm": 0.24747484922409058, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0084, + "step": 9130 + }, + { + "epoch": 0.55939775996083, + "grad_norm": 0.1863871067762375, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0061, + "step": 9140 + }, + { + "epoch": 0.5600097925209622, + "grad_norm": 0.3599740266799927, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0063, + "step": 9150 + }, + { + "epoch": 0.5606218250810943, + "grad_norm": 0.2238125205039978, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0047, + "step": 9160 + }, + { + "epoch": 0.5612338576412265, + "grad_norm": 0.272077351808548, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.006, + "step": 9170 + }, + { + "epoch": 0.5618458902013587, + "grad_norm": 0.2371625155210495, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0064, + "step": 9180 + }, + { + "epoch": 0.5624579227614909, + "grad_norm": 0.12783293426036835, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0054, + "step": 9190 + }, + { + "epoch": 0.5630699553216231, + "grad_norm": 0.3144581615924835, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0064, + "step": 9200 + }, + { + "epoch": 0.5636819878817553, + "grad_norm": 0.31995031237602234, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0082, + "step": 9210 + }, + { + "epoch": 0.5642940204418875, + "grad_norm": 0.31995660066604614, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0083, + "step": 9220 + }, + { + "epoch": 0.5649060530020197, + "grad_norm": 0.5018982291221619, + "learning_rate": 1.665453350687773e-05, + "loss": 0.007, + "step": 9230 + }, + { + "epoch": 0.5655180855621519, + "grad_norm": 0.2927841544151306, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0104, + "step": 9240 + }, + { + "epoch": 0.5661301181222841, + "grad_norm": 0.21124979853630066, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0068, + "step": 9250 + }, + { + "epoch": 0.5667421506824163, + "grad_norm": 0.25787463784217834, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.5673541832425485, + "grad_norm": 0.3194720447063446, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0065, + "step": 9270 + }, + { + "epoch": 0.5679662158026807, + "grad_norm": 0.24165599048137665, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.006, + "step": 9280 + }, + { + "epoch": 0.5685782483628129, + "grad_norm": 0.4880482256412506, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0096, + "step": 9290 + }, + { + "epoch": 0.5691902809229451, + "grad_norm": 0.24660199880599976, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0098, + "step": 9300 + }, + { + "epoch": 0.5698023134830773, + "grad_norm": 0.24707400798797607, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0052, + "step": 9310 + }, + { + "epoch": 0.5704143460432095, + "grad_norm": 0.33855682611465454, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.011, + "step": 9320 + }, + { + "epoch": 0.5710263786033417, + "grad_norm": 0.22913751006126404, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0074, + "step": 9330 + }, + { + "epoch": 0.5716384111634739, + "grad_norm": 0.24127185344696045, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0062, + "step": 9340 + }, + { + "epoch": 0.5722504437236061, + "grad_norm": 0.26104915142059326, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0065, + "step": 9350 + }, + { + "epoch": 0.5728624762837383, + "grad_norm": 0.21698857843875885, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0062, + "step": 9360 + }, + { + "epoch": 0.5734745088438705, + "grad_norm": 0.29092445969581604, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0081, + "step": 9370 + }, + { + "epoch": 0.5740865414040027, + "grad_norm": 0.2534378468990326, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0064, + "step": 9380 + }, + { + "epoch": 0.5746985739641349, + "grad_norm": 0.28900131583213806, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.5753106065242671, + "grad_norm": 0.3028101921081543, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0104, + "step": 9400 + }, + { + "epoch": 0.5759226390843993, + "grad_norm": 0.28851139545440674, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 0.5765346716445315, + "grad_norm": 0.5735841393470764, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0072, + "step": 9420 + }, + { + "epoch": 0.5771467042046637, + "grad_norm": 0.20355567336082458, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0056, + "step": 9430 + }, + { + "epoch": 0.5777587367647958, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.006, + "step": 9440 + }, + { + "epoch": 0.578370769324928, + "grad_norm": 0.2701684832572937, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 0.5789828018850602, + "grad_norm": 0.17381855845451355, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0054, + "step": 9460 + }, + { + "epoch": 0.5795948344451924, + "grad_norm": 0.250261515378952, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5802068670053246, + "grad_norm": 0.22972841560840607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0058, + "step": 9480 + }, + { + "epoch": 0.5808188995654568, + "grad_norm": 0.22654809057712555, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0061, + "step": 9490 + }, + { + "epoch": 0.581430932125589, + "grad_norm": 0.17165100574493408, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5820429646857213, + "grad_norm": 0.2462143450975418, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0054, + "step": 9510 + }, + { + "epoch": 0.5826549972458535, + "grad_norm": 0.3970383107662201, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0069, + "step": 9520 + }, + { + "epoch": 0.5832670298059857, + "grad_norm": 0.21578988432884216, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0054, + "step": 9530 + }, + { + "epoch": 0.5838790623661179, + "grad_norm": 0.5680915713310242, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0072, + "step": 9540 + }, + { + "epoch": 0.5844910949262501, + "grad_norm": 0.24070246517658234, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0073, + "step": 9550 + }, + { + "epoch": 0.5851031274863823, + "grad_norm": 0.2524685263633728, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0068, + "step": 9560 + }, + { + "epoch": 0.5857151600465145, + "grad_norm": 0.27286672592163086, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.008, + "step": 9570 + }, + { + "epoch": 0.5863271926066467, + "grad_norm": 0.3459629714488983, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0088, + "step": 9580 + }, + { + "epoch": 0.5869392251667789, + "grad_norm": 0.2964814603328705, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0066, + "step": 9590 + }, + { + "epoch": 0.5875512577269111, + "grad_norm": 0.3559853434562683, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0064, + "step": 9600 + }, + { + "epoch": 0.5881632902870433, + "grad_norm": 0.256898432970047, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0056, + "step": 9610 + }, + { + "epoch": 0.5887753228471755, + "grad_norm": 0.25032711029052734, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0052, + "step": 9620 + }, + { + "epoch": 0.5893873554073077, + "grad_norm": 0.2467224895954132, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0048, + "step": 9630 + }, + { + "epoch": 0.5899993879674399, + "grad_norm": 0.5331161618232727, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0078, + "step": 9640 + }, + { + "epoch": 0.5906114205275721, + "grad_norm": 0.33348897099494934, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0068, + "step": 9650 + }, + { + "epoch": 0.5912234530877043, + "grad_norm": 0.21435993909835815, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0058, + "step": 9660 + }, + { + "epoch": 0.5918354856478365, + "grad_norm": 0.35850396752357483, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0068, + "step": 9670 + }, + { + "epoch": 0.5924475182079687, + "grad_norm": 0.3007623851299286, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0063, + "step": 9680 + }, + { + "epoch": 0.5930595507681009, + "grad_norm": 0.22949714958667755, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0054, + "step": 9690 + }, + { + "epoch": 0.5936715833282331, + "grad_norm": 0.23259367048740387, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.5942836158883653, + "grad_norm": 0.2305079996585846, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0047, + "step": 9710 + }, + { + "epoch": 0.5948956484484974, + "grad_norm": 0.33875930309295654, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0063, + "step": 9720 + }, + { + "epoch": 0.5955076810086296, + "grad_norm": 0.3981896936893463, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0076, + "step": 9730 + }, + { + "epoch": 0.5961197135687618, + "grad_norm": 0.280831515789032, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0075, + "step": 9740 + }, + { + "epoch": 0.596731746128894, + "grad_norm": 0.26045629382133484, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0064, + "step": 9750 + }, + { + "epoch": 0.5973437786890262, + "grad_norm": 0.23102521896362305, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0071, + "step": 9760 + }, + { + "epoch": 0.5979558112491584, + "grad_norm": 0.5013224482536316, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0063, + "step": 9770 + }, + { + "epoch": 0.5985678438092906, + "grad_norm": 0.45689067244529724, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0088, + "step": 9780 + }, + { + "epoch": 0.5991798763694228, + "grad_norm": 0.27118632197380066, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.599791908929555, + "grad_norm": 0.420202374458313, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0076, + "step": 9800 + }, + { + "epoch": 0.6004039414896872, + "grad_norm": 0.35844025015830994, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0056, + "step": 9810 + }, + { + "epoch": 0.6010159740498194, + "grad_norm": 0.2205585241317749, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0082, + "step": 9820 + }, + { + "epoch": 0.6016280066099516, + "grad_norm": 0.18860426545143127, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.007, + "step": 9830 + }, + { + "epoch": 0.6022400391700838, + "grad_norm": 0.25045180320739746, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0082, + "step": 9840 + }, + { + "epoch": 0.602852071730216, + "grad_norm": 0.2581705152988434, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0065, + "step": 9850 + }, + { + "epoch": 0.6034641042903482, + "grad_norm": 0.25894811749458313, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0058, + "step": 9860 + }, + { + "epoch": 0.6040761368504804, + "grad_norm": 0.43305444717407227, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0066, + "step": 9870 + }, + { + "epoch": 0.6046881694106127, + "grad_norm": 0.2295757383108139, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0069, + "step": 9880 + }, + { + "epoch": 0.6053002019707449, + "grad_norm": 0.29785802960395813, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0074, + "step": 9890 + }, + { + "epoch": 0.6059122345308771, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0068, + "step": 9900 + }, + { + "epoch": 0.6065242670910093, + "grad_norm": 0.29115045070648193, + "learning_rate": 1.612387195896372e-05, + "loss": 0.008, + "step": 9910 + }, + { + "epoch": 0.6071362996511415, + "grad_norm": 0.3202555477619171, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0071, + "step": 9920 + }, + { + "epoch": 0.6077483322112737, + "grad_norm": 0.2849314212799072, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.005, + "step": 9930 + }, + { + "epoch": 0.6083603647714059, + "grad_norm": 0.2768756151199341, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0051, + "step": 9940 + }, + { + "epoch": 0.6089723973315381, + "grad_norm": 0.3138035535812378, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0058, + "step": 9950 + }, + { + "epoch": 0.6095844298916703, + "grad_norm": 0.20827682316303253, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0058, + "step": 9960 + }, + { + "epoch": 0.6101964624518025, + "grad_norm": 0.29986995458602905, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0076, + "step": 9970 + }, + { + "epoch": 0.6108084950119347, + "grad_norm": 0.23564326763153076, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0056, + "step": 9980 + }, + { + "epoch": 0.6114205275720669, + "grad_norm": 0.24854765832424164, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0066, + "step": 9990 + }, + { + "epoch": 0.6120325601321991, + "grad_norm": 0.5696694850921631, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0072, + "step": 10000 + }, + { + "epoch": 0.6126445926923312, + "grad_norm": 0.24267911911010742, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0061, + "step": 10010 + }, + { + "epoch": 0.6132566252524634, + "grad_norm": 0.1955283135175705, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0076, + "step": 10020 + }, + { + "epoch": 0.6138686578125956, + "grad_norm": 0.3427830934524536, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0073, + "step": 10030 + }, + { + "epoch": 0.6144806903727278, + "grad_norm": 0.38532915711402893, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0078, + "step": 10040 + }, + { + "epoch": 0.61509272293286, + "grad_norm": 0.4302294850349426, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0071, + "step": 10050 + }, + { + "epoch": 0.6157047554929922, + "grad_norm": 0.38420233130455017, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0072, + "step": 10060 + }, + { + "epoch": 0.6163167880531244, + "grad_norm": 0.23822636902332306, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.004, + "step": 10070 + }, + { + "epoch": 0.6169288206132566, + "grad_norm": 0.25123289227485657, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0065, + "step": 10080 + }, + { + "epoch": 0.6175408531733888, + "grad_norm": 0.23007746040821075, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0059, + "step": 10090 + }, + { + "epoch": 0.618152885733521, + "grad_norm": 0.24051082134246826, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0089, + "step": 10100 + }, + { + "epoch": 0.6187649182936532, + "grad_norm": 0.26246321201324463, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0052, + "step": 10110 + }, + { + "epoch": 0.6193769508537854, + "grad_norm": 0.3160432279109955, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0059, + "step": 10120 + }, + { + "epoch": 0.6199889834139176, + "grad_norm": 0.42534199357032776, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0071, + "step": 10130 + }, + { + "epoch": 0.6206010159740498, + "grad_norm": 0.22966268658638, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0052, + "step": 10140 + }, + { + "epoch": 0.621213048534182, + "grad_norm": 0.22234882414340973, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0073, + "step": 10150 + }, + { + "epoch": 0.6218250810943142, + "grad_norm": 0.31061676144599915, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0066, + "step": 10160 + }, + { + "epoch": 0.6224371136544464, + "grad_norm": 0.34178492426872253, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.6230491462145786, + "grad_norm": 0.263583779335022, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0079, + "step": 10180 + }, + { + "epoch": 0.6236611787747108, + "grad_norm": 0.3774336278438568, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0066, + "step": 10190 + }, + { + "epoch": 0.624273211334843, + "grad_norm": 0.29274430871009827, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.007, + "step": 10200 + }, + { + "epoch": 0.6248852438949752, + "grad_norm": 0.31850868463516235, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0056, + "step": 10210 + }, + { + "epoch": 0.6254972764551074, + "grad_norm": 0.3084369897842407, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0086, + "step": 10220 + }, + { + "epoch": 0.6261093090152396, + "grad_norm": 0.21596118807792664, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0072, + "step": 10230 + }, + { + "epoch": 0.6267213415753718, + "grad_norm": 0.16397996246814728, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0075, + "step": 10240 + }, + { + "epoch": 0.627333374135504, + "grad_norm": 0.15055827796459198, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0046, + "step": 10250 + }, + { + "epoch": 0.6279454066956363, + "grad_norm": 0.23483684659004211, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0064, + "step": 10260 + }, + { + "epoch": 0.6285574392557685, + "grad_norm": 0.3131091594696045, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0069, + "step": 10270 + }, + { + "epoch": 0.6291694718159007, + "grad_norm": 0.27958226203918457, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0067, + "step": 10280 + }, + { + "epoch": 0.6297815043760328, + "grad_norm": 0.23422567546367645, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0086, + "step": 10290 + }, + { + "epoch": 0.630393536936165, + "grad_norm": 0.4644703269004822, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0067, + "step": 10300 + }, + { + "epoch": 0.6310055694962972, + "grad_norm": 0.45787107944488525, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0068, + "step": 10310 + }, + { + "epoch": 0.6316176020564294, + "grad_norm": 0.21038737893104553, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0074, + "step": 10320 + }, + { + "epoch": 0.6322296346165616, + "grad_norm": 0.23812010884284973, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0054, + "step": 10330 + }, + { + "epoch": 0.6328416671766938, + "grad_norm": 0.36856284737586975, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0061, + "step": 10340 + }, + { + "epoch": 0.633453699736826, + "grad_norm": 0.3540131151676178, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0074, + "step": 10350 + }, + { + "epoch": 0.6340657322969582, + "grad_norm": 0.3004823923110962, + "learning_rate": 1.575723252169281e-05, + "loss": 0.006, + "step": 10360 + }, + { + "epoch": 0.6346777648570904, + "grad_norm": 0.17188489437103271, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0053, + "step": 10370 + }, + { + "epoch": 0.6352897974172226, + "grad_norm": 0.21710847318172455, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0062, + "step": 10380 + }, + { + "epoch": 0.6359018299773548, + "grad_norm": 0.2356785386800766, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0061, + "step": 10390 + }, + { + "epoch": 0.636513862537487, + "grad_norm": 0.2736414670944214, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.6371258950976192, + "grad_norm": 0.23872444033622742, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.006, + "step": 10410 + }, + { + "epoch": 0.6377379276577514, + "grad_norm": 0.24478361010551453, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0073, + "step": 10420 + }, + { + "epoch": 0.6383499602178836, + "grad_norm": 0.2964334487915039, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0048, + "step": 10430 + }, + { + "epoch": 0.6389619927780158, + "grad_norm": 0.2760549783706665, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0051, + "step": 10440 + }, + { + "epoch": 0.639574025338148, + "grad_norm": 0.2598065137863159, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0072, + "step": 10450 + }, + { + "epoch": 0.6401860578982802, + "grad_norm": 0.346999853849411, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0052, + "step": 10460 + }, + { + "epoch": 0.6407980904584124, + "grad_norm": 0.31291016936302185, + "learning_rate": 1.56658563993822e-05, + "loss": 0.007, + "step": 10470 + }, + { + "epoch": 0.6414101230185446, + "grad_norm": 0.2631952166557312, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6420221555786768, + "grad_norm": 0.30895209312438965, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.006, + "step": 10490 + }, + { + "epoch": 0.642634188138809, + "grad_norm": 0.17614217102527618, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0069, + "step": 10500 + }, + { + "epoch": 0.6432462206989412, + "grad_norm": 0.38792312145233154, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0077, + "step": 10510 + }, + { + "epoch": 0.6438582532590734, + "grad_norm": 0.1722564697265625, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0054, + "step": 10520 + }, + { + "epoch": 0.6444702858192056, + "grad_norm": 0.2741699516773224, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0062, + "step": 10530 + }, + { + "epoch": 0.6450823183793378, + "grad_norm": 0.2059863954782486, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0073, + "step": 10540 + }, + { + "epoch": 0.64569435093947, + "grad_norm": 0.2702447474002838, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.6463063834996022, + "grad_norm": 0.2299312800168991, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0051, + "step": 10560 + }, + { + "epoch": 0.6469184160597343, + "grad_norm": 0.1995723992586136, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0057, + "step": 10570 + }, + { + "epoch": 0.6475304486198665, + "grad_norm": 0.30346980690956116, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0063, + "step": 10580 + }, + { + "epoch": 0.6481424811799987, + "grad_norm": 0.5040738582611084, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0064, + "step": 10590 + }, + { + "epoch": 0.6487545137401309, + "grad_norm": 0.16984818875789642, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0071, + "step": 10600 + }, + { + "epoch": 0.6493665463002631, + "grad_norm": 0.26560020446777344, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 0.6499785788603953, + "grad_norm": 0.4563823342323303, + "learning_rate": 1.554018740860716e-05, + "loss": 0.008, + "step": 10620 + }, + { + "epoch": 0.6505906114205275, + "grad_norm": 0.23272818326950073, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.006, + "step": 10630 + }, + { + "epoch": 0.6512026439806597, + "grad_norm": 0.19166870415210724, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0066, + "step": 10640 + }, + { + "epoch": 0.651814676540792, + "grad_norm": 0.2822705805301666, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0062, + "step": 10650 + }, + { + "epoch": 0.6524267091009242, + "grad_norm": 0.24001267552375793, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 0.6530387416610564, + "grad_norm": 0.2563900947570801, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0068, + "step": 10670 + }, + { + "epoch": 0.6536507742211886, + "grad_norm": 0.2747437357902527, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0058, + "step": 10680 + }, + { + "epoch": 0.6542628067813208, + "grad_norm": 0.39710354804992676, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.005, + "step": 10690 + }, + { + "epoch": 0.654874839341453, + "grad_norm": 0.30690231919288635, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0102, + "step": 10700 + }, + { + "epoch": 0.6554868719015852, + "grad_norm": 0.2879253923892975, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0072, + "step": 10710 + }, + { + "epoch": 0.6560989044617174, + "grad_norm": 0.19964110851287842, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.6567109370218496, + "grad_norm": 0.20109151303768158, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6573229695819818, + "grad_norm": 0.21469832956790924, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0048, + "step": 10740 + }, + { + "epoch": 0.657935002142114, + "grad_norm": 0.19622936844825745, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0057, + "step": 10750 + }, + { + "epoch": 0.6585470347022462, + "grad_norm": 0.2255190759897232, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0052, + "step": 10760 + }, + { + "epoch": 0.6591590672623784, + "grad_norm": 0.47484955191612244, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0066, + "step": 10770 + }, + { + "epoch": 0.6597710998225106, + "grad_norm": 0.32192179560661316, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0067, + "step": 10780 + }, + { + "epoch": 0.6603831323826428, + "grad_norm": 0.33044904470443726, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0061, + "step": 10790 + }, + { + "epoch": 0.660995164942775, + "grad_norm": 0.3206661343574524, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0072, + "step": 10800 + }, + { + "epoch": 0.6616071975029072, + "grad_norm": 0.34903818368911743, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0055, + "step": 10810 + }, + { + "epoch": 0.6622192300630394, + "grad_norm": 0.1982222944498062, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 0.6628312626231716, + "grad_norm": 0.25388309359550476, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "epoch": 0.6634432951833038, + "grad_norm": 0.2325269728899002, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0078, + "step": 10840 + }, + { + "epoch": 0.6640553277434359, + "grad_norm": 0.3364964425563812, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0054, + "step": 10850 + }, + { + "epoch": 0.6646673603035681, + "grad_norm": 0.198661208152771, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0061, + "step": 10860 + }, + { + "epoch": 0.6652793928637003, + "grad_norm": 0.333836168050766, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0058, + "step": 10870 + }, + { + "epoch": 0.6658914254238325, + "grad_norm": 0.21908101439476013, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0087, + "step": 10880 + }, + { + "epoch": 0.6665034579839647, + "grad_norm": 0.3094167709350586, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0062, + "step": 10890 + }, + { + "epoch": 0.6671154905440969, + "grad_norm": 0.28113746643066406, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0066, + "step": 10900 + }, + { + "epoch": 0.6677275231042291, + "grad_norm": 0.20239399373531342, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0071, + "step": 10910 + }, + { + "epoch": 0.6683395556643613, + "grad_norm": 0.32829156517982483, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0056, + "step": 10920 + }, + { + "epoch": 0.6689515882244935, + "grad_norm": 0.2950859069824219, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 0.6695636207846257, + "grad_norm": 0.36404141783714294, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0075, + "step": 10940 + }, + { + "epoch": 0.6701756533447579, + "grad_norm": 0.2479381114244461, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0055, + "step": 10950 + }, + { + "epoch": 0.6707876859048901, + "grad_norm": 0.1934390366077423, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.005, + "step": 10960 + }, + { + "epoch": 0.6713997184650223, + "grad_norm": 0.20912423729896545, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0064, + "step": 10970 + }, + { + "epoch": 0.6720117510251545, + "grad_norm": 0.1781405806541443, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0048, + "step": 10980 + }, + { + "epoch": 0.6726237835852867, + "grad_norm": 0.18812811374664307, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0048, + "step": 10990 + }, + { + "epoch": 0.6732358161454189, + "grad_norm": 0.2006077766418457, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0073, + "step": 11000 + }, + { + "epoch": 0.6738478487055511, + "grad_norm": 0.20471568405628204, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0068, + "step": 11010 + }, + { + "epoch": 0.6744598812656833, + "grad_norm": 0.2979716658592224, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0067, + "step": 11020 + }, + { + "epoch": 0.6750719138258156, + "grad_norm": 0.3256290853023529, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0054, + "step": 11030 + }, + { + "epoch": 0.6756839463859478, + "grad_norm": 0.3346560001373291, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0061, + "step": 11040 + }, + { + "epoch": 0.67629597894608, + "grad_norm": 0.35791122913360596, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0054, + "step": 11050 + }, + { + "epoch": 0.6769080115062122, + "grad_norm": 0.30428826808929443, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0066, + "step": 11060 + }, + { + "epoch": 0.6775200440663444, + "grad_norm": 0.31254154443740845, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0065, + "step": 11070 + }, + { + "epoch": 0.6781320766264766, + "grad_norm": 0.263028621673584, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0062, + "step": 11080 + }, + { + "epoch": 0.6787441091866088, + "grad_norm": 0.22496990859508514, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.679356141746741, + "grad_norm": 0.2647632360458374, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0072, + "step": 11100 + }, + { + "epoch": 0.6799681743068732, + "grad_norm": 0.2517150342464447, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0064, + "step": 11110 + }, + { + "epoch": 0.6805802068670054, + "grad_norm": 0.30550616979599, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0055, + "step": 11120 + }, + { + "epoch": 0.6811922394271375, + "grad_norm": 0.21312931180000305, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0074, + "step": 11130 + }, + { + "epoch": 0.6818042719872697, + "grad_norm": 0.21152199804782867, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0047, + "step": 11140 + }, + { + "epoch": 0.6824163045474019, + "grad_norm": 0.2030613273382187, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0045, + "step": 11150 + }, + { + "epoch": 0.6830283371075341, + "grad_norm": 0.30646151304244995, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0045, + "step": 11160 + }, + { + "epoch": 0.6836403696676663, + "grad_norm": 0.2693783938884735, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0061, + "step": 11170 + }, + { + "epoch": 0.6842524022277985, + "grad_norm": 0.25288495421409607, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0068, + "step": 11180 + }, + { + "epoch": 0.6848644347879307, + "grad_norm": 0.34989964962005615, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.007, + "step": 11190 + }, + { + "epoch": 0.6854764673480629, + "grad_norm": 0.192350834608078, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0064, + "step": 11200 + }, + { + "epoch": 0.6860884999081951, + "grad_norm": 0.3841196894645691, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.6867005324683273, + "grad_norm": 0.2168666571378708, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0063, + "step": 11220 + }, + { + "epoch": 0.6873125650284595, + "grad_norm": 0.2756234109401703, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0068, + "step": 11230 + }, + { + "epoch": 0.6879245975885917, + "grad_norm": 0.1971903294324875, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.006, + "step": 11240 + }, + { + "epoch": 0.6885366301487239, + "grad_norm": 0.3857499659061432, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0063, + "step": 11250 + }, + { + "epoch": 0.6891486627088561, + "grad_norm": 0.194110706448555, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0049, + "step": 11260 + }, + { + "epoch": 0.6897606952689883, + "grad_norm": 0.24935179948806763, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0054, + "step": 11270 + }, + { + "epoch": 0.6903727278291205, + "grad_norm": 0.5208527445793152, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0062, + "step": 11280 + }, + { + "epoch": 0.6909847603892527, + "grad_norm": 0.2917899191379547, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0058, + "step": 11290 + }, + { + "epoch": 0.6915967929493849, + "grad_norm": 0.42692577838897705, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0072, + "step": 11300 + }, + { + "epoch": 0.6922088255095171, + "grad_norm": 0.36888429522514343, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0059, + "step": 11310 + }, + { + "epoch": 0.6928208580696493, + "grad_norm": 0.26246029138565063, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0057, + "step": 11320 + }, + { + "epoch": 0.6934328906297815, + "grad_norm": 0.22163739800453186, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0078, + "step": 11330 + }, + { + "epoch": 0.6940449231899137, + "grad_norm": 0.33411458134651184, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0053, + "step": 11340 + }, + { + "epoch": 0.6946569557500459, + "grad_norm": 0.2792898118495941, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0095, + "step": 11350 + }, + { + "epoch": 0.6952689883101781, + "grad_norm": 0.2770175039768219, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0059, + "step": 11360 + }, + { + "epoch": 0.6958810208703103, + "grad_norm": 0.14913171529769897, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0046, + "step": 11370 + }, + { + "epoch": 0.6964930534304425, + "grad_norm": 0.22906239330768585, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0054, + "step": 11380 + }, + { + "epoch": 0.6971050859905747, + "grad_norm": 0.2854336202144623, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0052, + "step": 11390 + }, + { + "epoch": 0.697717118550707, + "grad_norm": 0.21835818886756897, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0064, + "step": 11400 + }, + { + "epoch": 0.698329151110839, + "grad_norm": 0.42180293798446655, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0071, + "step": 11410 + }, + { + "epoch": 0.6989411836709712, + "grad_norm": 0.3056841492652893, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0092, + "step": 11420 + }, + { + "epoch": 0.6995532162311034, + "grad_norm": 0.15149559080600739, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0049, + "step": 11430 + }, + { + "epoch": 0.7001652487912357, + "grad_norm": 0.15561188757419586, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0051, + "step": 11440 + }, + { + "epoch": 0.7007772813513679, + "grad_norm": 0.2941122055053711, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0065, + "step": 11450 + }, + { + "epoch": 0.7013893139115001, + "grad_norm": 0.3008195757865906, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0059, + "step": 11460 + }, + { + "epoch": 0.7020013464716323, + "grad_norm": 0.3787235617637634, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0068, + "step": 11470 + }, + { + "epoch": 0.7026133790317645, + "grad_norm": 0.2069675624370575, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.005, + "step": 11480 + }, + { + "epoch": 0.7032254115918967, + "grad_norm": 0.33505553007125854, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0058, + "step": 11490 + }, + { + "epoch": 0.7038374441520289, + "grad_norm": 0.281213641166687, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0064, + "step": 11500 + }, + { + "epoch": 0.7044494767121611, + "grad_norm": 0.28471192717552185, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0066, + "step": 11510 + }, + { + "epoch": 0.7050615092722933, + "grad_norm": 0.3166801929473877, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0062, + "step": 11520 + }, + { + "epoch": 0.7056735418324255, + "grad_norm": 0.26893407106399536, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.005, + "step": 11530 + }, + { + "epoch": 0.7062855743925577, + "grad_norm": 0.17421478033065796, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0063, + "step": 11540 + }, + { + "epoch": 0.7068976069526899, + "grad_norm": 0.40999990701675415, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 0.7075096395128221, + "grad_norm": 0.190180242061615, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0061, + "step": 11560 + }, + { + "epoch": 0.7081216720729543, + "grad_norm": 0.20383603870868683, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0049, + "step": 11570 + }, + { + "epoch": 0.7087337046330865, + "grad_norm": 0.28741395473480225, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.7093457371932187, + "grad_norm": 0.24231962859630585, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.008, + "step": 11590 + }, + { + "epoch": 0.7099577697533509, + "grad_norm": 0.2221115529537201, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0053, + "step": 11600 + }, + { + "epoch": 0.7105698023134831, + "grad_norm": 0.18564820289611816, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0072, + "step": 11610 + }, + { + "epoch": 0.7111818348736153, + "grad_norm": 0.3734343647956848, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0089, + "step": 11620 + }, + { + "epoch": 0.7117938674337475, + "grad_norm": 0.3215912878513336, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0093, + "step": 11630 + }, + { + "epoch": 0.7124058999938797, + "grad_norm": 0.22602899372577667, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0062, + "step": 11640 + }, + { + "epoch": 0.7130179325540119, + "grad_norm": 0.3115978538990021, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.006, + "step": 11650 + }, + { + "epoch": 0.7136299651141441, + "grad_norm": 0.26148155331611633, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0071, + "step": 11660 + }, + { + "epoch": 0.7142419976742763, + "grad_norm": 0.142781600356102, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0051, + "step": 11670 + }, + { + "epoch": 0.7148540302344085, + "grad_norm": 0.21306048333644867, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0068, + "step": 11680 + }, + { + "epoch": 0.7154660627945407, + "grad_norm": 0.3439876437187195, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7160780953546728, + "grad_norm": 0.4010280966758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0062, + "step": 11700 + }, + { + "epoch": 0.716690127914805, + "grad_norm": 0.2760031819343567, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 0.7173021604749372, + "grad_norm": 0.45097261667251587, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0062, + "step": 11720 + }, + { + "epoch": 0.7179141930350694, + "grad_norm": 0.20118115842342377, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0054, + "step": 11730 + }, + { + "epoch": 0.7185262255952016, + "grad_norm": 0.3090760409832001, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0054, + "step": 11740 + }, + { + "epoch": 0.7191382581553338, + "grad_norm": 0.25016647577285767, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0077, + "step": 11750 + }, + { + "epoch": 0.719750290715466, + "grad_norm": 0.2310703545808792, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0053, + "step": 11760 + }, + { + "epoch": 0.7203623232755982, + "grad_norm": 0.2269359678030014, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.006, + "step": 11770 + }, + { + "epoch": 0.7209743558357304, + "grad_norm": 0.3917788565158844, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7215863883958626, + "grad_norm": 0.25999465584754944, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0064, + "step": 11790 + }, + { + "epoch": 0.7221984209559948, + "grad_norm": 0.19340357184410095, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0071, + "step": 11800 + }, + { + "epoch": 0.722810453516127, + "grad_norm": 0.25046268105506897, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0078, + "step": 11810 + }, + { + "epoch": 0.7234224860762593, + "grad_norm": 0.19819264113903046, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.005, + "step": 11820 + }, + { + "epoch": 0.7240345186363915, + "grad_norm": 0.43484950065612793, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0078, + "step": 11830 + }, + { + "epoch": 0.7246465511965237, + "grad_norm": 0.29191601276397705, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7252585837566559, + "grad_norm": 0.21717441082000732, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0056, + "step": 11850 + }, + { + "epoch": 0.7258706163167881, + "grad_norm": 0.3210129737854004, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0072, + "step": 11860 + }, + { + "epoch": 0.7264826488769203, + "grad_norm": 0.33192649483680725, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0061, + "step": 11870 + }, + { + "epoch": 0.7270946814370525, + "grad_norm": 0.14648163318634033, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7277067139971847, + "grad_norm": 0.20028764009475708, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0052, + "step": 11890 + }, + { + "epoch": 0.7283187465573169, + "grad_norm": 0.21449612081050873, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0063, + "step": 11900 + }, + { + "epoch": 0.7289307791174491, + "grad_norm": 0.27472081780433655, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0062, + "step": 11910 + }, + { + "epoch": 0.7295428116775813, + "grad_norm": 0.2919130027294159, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0048, + "step": 11920 + }, + { + "epoch": 0.7301548442377135, + "grad_norm": 0.153092160820961, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0053, + "step": 11930 + }, + { + "epoch": 0.7307668767978457, + "grad_norm": 0.22820086777210236, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0058, + "step": 11940 + }, + { + "epoch": 0.7313789093579779, + "grad_norm": 0.24281881749629974, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0044, + "step": 11950 + }, + { + "epoch": 0.7319909419181101, + "grad_norm": 0.32581812143325806, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0051, + "step": 11960 + }, + { + "epoch": 0.7326029744782423, + "grad_norm": 0.3139822483062744, + "learning_rate": 1.435930222050582e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 0.7332150070383744, + "grad_norm": 0.37985655665397644, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0052, + "step": 11980 + }, + { + "epoch": 0.7338270395985066, + "grad_norm": 0.1958508938550949, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.007, + "step": 11990 + }, + { + "epoch": 0.7344390721586388, + "grad_norm": 0.25318172574043274, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0084, + "step": 12000 + }, + { + "epoch": 0.735051104718771, + "grad_norm": 0.33245304226875305, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0051, + "step": 12010 + }, + { + "epoch": 0.7356631372789032, + "grad_norm": 0.2750372290611267, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0057, + "step": 12020 + }, + { + "epoch": 0.7362751698390354, + "grad_norm": 0.2057010382413864, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0057, + "step": 12030 + }, + { + "epoch": 0.7368872023991676, + "grad_norm": 0.30713731050491333, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0067, + "step": 12040 + }, + { + "epoch": 0.7374992349592998, + "grad_norm": 0.20423808693885803, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.006, + "step": 12050 + }, + { + "epoch": 0.738111267519432, + "grad_norm": 0.3129539489746094, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0067, + "step": 12060 + }, + { + "epoch": 0.7387233000795642, + "grad_norm": 0.25026270747184753, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7393353326396964, + "grad_norm": 0.4147534668445587, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0061, + "step": 12080 + }, + { + "epoch": 0.7399473651998286, + "grad_norm": 0.20954278111457825, + "learning_rate": 1.425047976058418e-05, + "loss": 0.006, + "step": 12090 + }, + { + "epoch": 0.7405593977599608, + "grad_norm": 0.2700798809528351, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 0.741171430320093, + "grad_norm": 0.2597086429595947, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0059, + "step": 12110 + }, + { + "epoch": 0.7417834628802252, + "grad_norm": 0.2674495279788971, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0045, + "step": 12120 + }, + { + "epoch": 0.7423954954403574, + "grad_norm": 0.24583879113197327, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0061, + "step": 12130 + }, + { + "epoch": 0.7430075280004896, + "grad_norm": 0.23704801499843597, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0054, + "step": 12140 + }, + { + "epoch": 0.7436195605606218, + "grad_norm": 0.2381024807691574, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0073, + "step": 12150 + }, + { + "epoch": 0.744231593120754, + "grad_norm": 0.24937355518341064, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0049, + "step": 12160 + }, + { + "epoch": 0.7448436256808862, + "grad_norm": 0.20442882180213928, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0061, + "step": 12170 + }, + { + "epoch": 0.7454556582410184, + "grad_norm": 0.3053426742553711, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0087, + "step": 12180 + }, + { + "epoch": 0.7460676908011507, + "grad_norm": 0.3654315769672394, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0047, + "step": 12190 + }, + { + "epoch": 0.7466797233612829, + "grad_norm": 0.18926535546779633, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7472917559214151, + "grad_norm": 0.21620485186576843, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0094, + "step": 12210 + }, + { + "epoch": 0.7479037884815473, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0059, + "step": 12220 + }, + { + "epoch": 0.7485158210416795, + "grad_norm": 0.39795419573783875, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 0.7491278536018117, + "grad_norm": 0.20502857863903046, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0048, + "step": 12240 + }, + { + "epoch": 0.7497398861619439, + "grad_norm": 0.23821429908275604, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0082, + "step": 12250 + }, + { + "epoch": 0.750351918722076, + "grad_norm": 0.45541366934776306, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0071, + "step": 12260 + }, + { + "epoch": 0.7509639512822082, + "grad_norm": 0.24881400167942047, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.7515759838423404, + "grad_norm": 0.2409125715494156, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0061, + "step": 12280 + }, + { + "epoch": 0.7521880164024726, + "grad_norm": 0.2930417060852051, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0054, + "step": 12290 + }, + { + "epoch": 0.7528000489626048, + "grad_norm": 0.30566394329071045, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0054, + "step": 12300 + }, + { + "epoch": 0.753412081522737, + "grad_norm": 0.32679763436317444, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0059, + "step": 12310 + }, + { + "epoch": 0.7540241140828692, + "grad_norm": 0.29273876547813416, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0067, + "step": 12320 + }, + { + "epoch": 0.7546361466430014, + "grad_norm": 0.19642773270606995, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0062, + "step": 12330 + }, + { + "epoch": 0.7552481792031336, + "grad_norm": 0.21928250789642334, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0066, + "step": 12340 + }, + { + "epoch": 0.7558602117632658, + "grad_norm": 0.2534322738647461, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.756472244323398, + "grad_norm": 0.20712649822235107, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 0.7570842768835302, + "grad_norm": 0.18670639395713806, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0063, + "step": 12370 + }, + { + "epoch": 0.7576963094436624, + "grad_norm": 0.26770254969596863, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0052, + "step": 12380 + }, + { + "epoch": 0.7583083420037946, + "grad_norm": 0.3621291518211365, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0056, + "step": 12390 + }, + { + "epoch": 0.7589203745639268, + "grad_norm": 0.31771939992904663, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0059, + "step": 12400 + }, + { + "epoch": 0.759532407124059, + "grad_norm": 0.44418177008628845, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0065, + "step": 12410 + }, + { + "epoch": 0.7601444396841912, + "grad_norm": 0.2183474898338318, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0046, + "step": 12420 + }, + { + "epoch": 0.7607564722443234, + "grad_norm": 0.4400590658187866, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0061, + "step": 12430 + }, + { + "epoch": 0.7613685048044556, + "grad_norm": 0.296539843082428, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0059, + "step": 12440 + }, + { + "epoch": 0.7619805373645878, + "grad_norm": 0.352870374917984, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0055, + "step": 12450 + }, + { + "epoch": 0.76259256992472, + "grad_norm": 0.19494596123695374, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.7632046024848522, + "grad_norm": 0.3799489438533783, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0057, + "step": 12470 + }, + { + "epoch": 0.7638166350449844, + "grad_norm": 0.3572365641593933, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0053, + "step": 12480 + }, + { + "epoch": 0.7644286676051166, + "grad_norm": 0.2559097707271576, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0062, + "step": 12490 + }, + { + "epoch": 0.7650407001652488, + "grad_norm": 0.13144978880882263, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0065, + "step": 12500 + }, + { + "epoch": 0.765652732725381, + "grad_norm": 0.34635287523269653, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7662647652855132, + "grad_norm": 0.25615188479423523, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0057, + "step": 12520 + }, + { + "epoch": 0.7668767978456454, + "grad_norm": 0.17619644105434418, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0047, + "step": 12530 + }, + { + "epoch": 0.7674888304057775, + "grad_norm": 0.20169994235038757, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0068, + "step": 12540 + }, + { + "epoch": 0.7681008629659097, + "grad_norm": 0.49686071276664734, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0066, + "step": 12550 + }, + { + "epoch": 0.7687128955260419, + "grad_norm": 0.28179335594177246, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0058, + "step": 12560 + }, + { + "epoch": 0.7693249280861741, + "grad_norm": 0.28156182169914246, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.005, + "step": 12570 + }, + { + "epoch": 0.7699369606463063, + "grad_norm": 0.15054315328598022, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0051, + "step": 12580 + }, + { + "epoch": 0.7705489932064385, + "grad_norm": 0.22872644662857056, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0066, + "step": 12590 + }, + { + "epoch": 0.7711610257665708, + "grad_norm": 0.25821951031684875, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0054, + "step": 12600 + }, + { + "epoch": 0.771773058326703, + "grad_norm": 0.23592771589756012, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0059, + "step": 12610 + }, + { + "epoch": 0.7723850908868352, + "grad_norm": 0.34409141540527344, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0053, + "step": 12620 + }, + { + "epoch": 0.7729971234469674, + "grad_norm": 0.2803158760070801, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0042, + "step": 12630 + }, + { + "epoch": 0.7736091560070996, + "grad_norm": 0.32796284556388855, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0074, + "step": 12640 + }, + { + "epoch": 0.7742211885672318, + "grad_norm": 0.34749120473861694, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0054, + "step": 12650 + }, + { + "epoch": 0.774833221127364, + "grad_norm": 0.34066343307495117, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0082, + "step": 12660 + }, + { + "epoch": 0.7754452536874962, + "grad_norm": 0.4294384717941284, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0062, + "step": 12670 + }, + { + "epoch": 0.7760572862476284, + "grad_norm": 0.2355230748653412, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0069, + "step": 12680 + }, + { + "epoch": 0.7766693188077606, + "grad_norm": 0.3181976079940796, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 0.7772813513678928, + "grad_norm": 0.2763727605342865, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0052, + "step": 12700 + }, + { + "epoch": 0.777893383928025, + "grad_norm": 0.2938949465751648, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0041, + "step": 12710 + }, + { + "epoch": 0.7785054164881572, + "grad_norm": 0.31331220269203186, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0062, + "step": 12720 + }, + { + "epoch": 0.7791174490482894, + "grad_norm": 0.3389904797077179, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0061, + "step": 12730 + }, + { + "epoch": 0.7797294816084216, + "grad_norm": 0.2848975360393524, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0065, + "step": 12740 + }, + { + "epoch": 0.7803415141685538, + "grad_norm": 0.29838478565216064, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0061, + "step": 12750 + }, + { + "epoch": 0.780953546728686, + "grad_norm": 0.47004032135009766, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0066, + "step": 12760 + }, + { + "epoch": 0.7815655792888182, + "grad_norm": 0.26898056268692017, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0063, + "step": 12770 + }, + { + "epoch": 0.7821776118489504, + "grad_norm": 0.29459917545318604, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0065, + "step": 12780 + }, + { + "epoch": 0.7827896444090826, + "grad_norm": 0.3481508791446686, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0058, + "step": 12790 + }, + { + "epoch": 0.7834016769692148, + "grad_norm": 0.1707627922296524, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0053, + "step": 12800 + }, + { + "epoch": 0.784013709529347, + "grad_norm": 0.14735333621501923, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0058, + "step": 12810 + }, + { + "epoch": 0.7846257420894791, + "grad_norm": 0.28002044558525085, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.006, + "step": 12820 + }, + { + "epoch": 0.7852377746496113, + "grad_norm": 0.39598894119262695, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0062, + "step": 12830 + }, + { + "epoch": 0.7858498072097435, + "grad_norm": 0.19379247725009918, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0058, + "step": 12840 + }, + { + "epoch": 0.7864618397698757, + "grad_norm": 0.27260729670524597, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7870738723300079, + "grad_norm": 0.2845087945461273, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0052, + "step": 12860 + }, + { + "epoch": 0.7876859048901401, + "grad_norm": 0.37151217460632324, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0043, + "step": 12870 + }, + { + "epoch": 0.7882979374502723, + "grad_norm": 0.3387412130832672, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0046, + "step": 12880 + }, + { + "epoch": 0.7889099700104045, + "grad_norm": 0.42672809958457947, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7895220025705367, + "grad_norm": 0.20378202199935913, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.7901340351306689, + "grad_norm": 0.16417330503463745, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0045, + "step": 12910 + }, + { + "epoch": 0.7907460676908011, + "grad_norm": 0.1704142540693283, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0054, + "step": 12920 + }, + { + "epoch": 0.7913581002509333, + "grad_norm": 0.21494890749454498, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.7919701328110655, + "grad_norm": 0.3430638909339905, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0046, + "step": 12940 + }, + { + "epoch": 0.7925821653711977, + "grad_norm": 0.22641201317310333, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0049, + "step": 12950 + }, + { + "epoch": 0.79319419793133, + "grad_norm": 0.27153971791267395, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 0.7938062304914622, + "grad_norm": 0.2648560702800751, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 0.7944182630515944, + "grad_norm": 0.2148633897304535, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0048, + "step": 12980 + }, + { + "epoch": 0.7950302956117266, + "grad_norm": 0.35170191526412964, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0069, + "step": 12990 + }, + { + "epoch": 0.7956423281718588, + "grad_norm": 0.3539712429046631, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0067, + "step": 13000 + }, + { + "epoch": 0.796254360731991, + "grad_norm": 0.29938259720802307, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0102, + "step": 13010 + }, + { + "epoch": 0.7968663932921232, + "grad_norm": 0.35241010785102844, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7974784258522554, + "grad_norm": 0.2929113805294037, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0061, + "step": 13030 + }, + { + "epoch": 0.7980904584123876, + "grad_norm": 0.24052929878234863, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0046, + "step": 13040 + }, + { + "epoch": 0.7987024909725198, + "grad_norm": 0.21611042320728302, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0043, + "step": 13050 + }, + { + "epoch": 0.799314523532652, + "grad_norm": 0.23498570919036865, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.7999265560927842, + "grad_norm": 0.30229923129081726, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0068, + "step": 13070 + }, + { + "epoch": 0.8005385886529164, + "grad_norm": 0.2916681170463562, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0062, + "step": 13080 + }, + { + "epoch": 0.8011506212130486, + "grad_norm": 0.31905195116996765, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.8017626537731807, + "grad_norm": 0.22307109832763672, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 0.8023746863333129, + "grad_norm": 0.2815198004245758, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0054, + "step": 13110 + }, + { + "epoch": 0.8029867188934451, + "grad_norm": 0.18762829899787903, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0064, + "step": 13120 + }, + { + "epoch": 0.8035987514535773, + "grad_norm": 0.1918255090713501, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0064, + "step": 13130 + }, + { + "epoch": 0.8042107840137095, + "grad_norm": 0.3726229667663574, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0065, + "step": 13140 + }, + { + "epoch": 0.8048228165738417, + "grad_norm": 0.423285573720932, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0062, + "step": 13150 + }, + { + "epoch": 0.8054348491339739, + "grad_norm": 0.1709958165884018, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0052, + "step": 13160 + }, + { + "epoch": 0.8060468816941061, + "grad_norm": 0.3615981936454773, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 0.8066589142542383, + "grad_norm": 0.2101999819278717, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0053, + "step": 13180 + }, + { + "epoch": 0.8072709468143705, + "grad_norm": 0.14393582940101624, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.8078829793745027, + "grad_norm": 0.3704521656036377, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0056, + "step": 13200 + }, + { + "epoch": 0.8084950119346349, + "grad_norm": 0.23275913298130035, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0051, + "step": 13210 + }, + { + "epoch": 0.8091070444947671, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0045, + "step": 13220 + }, + { + "epoch": 0.8097190770548993, + "grad_norm": 0.21721667051315308, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.8103311096150315, + "grad_norm": 0.29456019401550293, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 0.8109431421751637, + "grad_norm": 0.19854630529880524, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0071, + "step": 13250 + }, + { + "epoch": 0.8115551747352959, + "grad_norm": 0.4318163990974426, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0059, + "step": 13260 + }, + { + "epoch": 0.8121672072954281, + "grad_norm": 0.3421531915664673, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.006, + "step": 13270 + }, + { + "epoch": 0.8127792398555603, + "grad_norm": 0.2370125651359558, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0054, + "step": 13280 + }, + { + "epoch": 0.8133912724156925, + "grad_norm": 0.2996460497379303, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 0.8140033049758247, + "grad_norm": 0.2911904454231262, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0053, + "step": 13300 + }, + { + "epoch": 0.8146153375359569, + "grad_norm": 0.26010408997535706, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0053, + "step": 13310 + }, + { + "epoch": 0.8152273700960891, + "grad_norm": 0.404702752828598, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0081, + "step": 13320 + }, + { + "epoch": 0.8158394026562213, + "grad_norm": 0.25591781735420227, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0057, + "step": 13330 + }, + { + "epoch": 0.8164514352163535, + "grad_norm": 0.1437849998474121, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0064, + "step": 13340 + }, + { + "epoch": 0.8170634677764858, + "grad_norm": 0.12252022325992584, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0047, + "step": 13350 + }, + { + "epoch": 0.817675500336618, + "grad_norm": 0.1861230581998825, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0063, + "step": 13360 + }, + { + "epoch": 0.8182875328967502, + "grad_norm": 0.2313026636838913, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0066, + "step": 13370 + }, + { + "epoch": 0.8188995654568824, + "grad_norm": 0.5445839166641235, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0076, + "step": 13380 + }, + { + "epoch": 0.8195115980170145, + "grad_norm": 0.21818871796131134, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0068, + "step": 13390 + }, + { + "epoch": 0.8201236305771467, + "grad_norm": 0.21823963522911072, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0072, + "step": 13400 + }, + { + "epoch": 0.8207356631372789, + "grad_norm": 0.1730659157037735, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0051, + "step": 13410 + }, + { + "epoch": 0.8213476956974111, + "grad_norm": 0.1301007866859436, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0075, + "step": 13420 + }, + { + "epoch": 0.8219597282575433, + "grad_norm": 0.32452520728111267, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.005, + "step": 13430 + }, + { + "epoch": 0.8225717608176755, + "grad_norm": 0.24771001935005188, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0058, + "step": 13440 + }, + { + "epoch": 0.8231837933778077, + "grad_norm": 0.4575227200984955, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0062, + "step": 13450 + }, + { + "epoch": 0.8237958259379399, + "grad_norm": 0.16441279649734497, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0081, + "step": 13460 + }, + { + "epoch": 0.8244078584980721, + "grad_norm": 0.26582902669906616, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0069, + "step": 13470 + }, + { + "epoch": 0.8250198910582043, + "grad_norm": 0.18871302902698517, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0068, + "step": 13480 + }, + { + "epoch": 0.8256319236183365, + "grad_norm": 0.23244783282279968, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0063, + "step": 13490 + }, + { + "epoch": 0.8262439561784687, + "grad_norm": 0.2399880290031433, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0036, + "step": 13500 + }, + { + "epoch": 0.8268559887386009, + "grad_norm": 0.25766822695732117, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0074, + "step": 13510 + }, + { + "epoch": 0.8274680212987331, + "grad_norm": 0.24792100489139557, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0066, + "step": 13520 + }, + { + "epoch": 0.8280800538588653, + "grad_norm": 0.3371896743774414, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8286920864189975, + "grad_norm": 0.16249819099903107, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0079, + "step": 13540 + }, + { + "epoch": 0.8293041189791297, + "grad_norm": 0.2705139219760895, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0069, + "step": 13550 + }, + { + "epoch": 0.8299161515392619, + "grad_norm": 0.1905352771282196, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0055, + "step": 13560 + }, + { + "epoch": 0.8305281840993941, + "grad_norm": 0.23938500881195068, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0055, + "step": 13570 + }, + { + "epoch": 0.8311402166595263, + "grad_norm": 0.3562251031398773, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0054, + "step": 13580 + }, + { + "epoch": 0.8317522492196585, + "grad_norm": 0.2934769093990326, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0064, + "step": 13590 + }, + { + "epoch": 0.8323642817797907, + "grad_norm": 0.252366840839386, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0078, + "step": 13600 + }, + { + "epoch": 0.8329763143399229, + "grad_norm": 0.16646964848041534, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0046, + "step": 13610 + }, + { + "epoch": 0.8335883469000551, + "grad_norm": 0.22584658861160278, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8342003794601873, + "grad_norm": 0.3578774034976959, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0049, + "step": 13630 + }, + { + "epoch": 0.8348124120203195, + "grad_norm": 0.3447739779949188, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0065, + "step": 13640 + }, + { + "epoch": 0.8354244445804517, + "grad_norm": 0.381954550743103, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0057, + "step": 13650 + }, + { + "epoch": 0.8360364771405839, + "grad_norm": 0.3563731908798218, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0065, + "step": 13660 + }, + { + "epoch": 0.836648509700716, + "grad_norm": 0.29516372084617615, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0053, + "step": 13670 + }, + { + "epoch": 0.8372605422608482, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0043, + "step": 13680 + }, + { + "epoch": 0.8378725748209804, + "grad_norm": 0.4608387351036072, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.005, + "step": 13690 + }, + { + "epoch": 0.8384846073811126, + "grad_norm": 0.31025534868240356, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0055, + "step": 13700 + }, + { + "epoch": 0.8390966399412448, + "grad_norm": 0.32904690504074097, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.839708672501377, + "grad_norm": 0.2547053098678589, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0061, + "step": 13720 + }, + { + "epoch": 0.8403207050615092, + "grad_norm": 0.30524104833602905, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.005, + "step": 13730 + }, + { + "epoch": 0.8409327376216414, + "grad_norm": 0.17741642892360687, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "epoch": 0.8415447701817736, + "grad_norm": 0.23125578463077545, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0053, + "step": 13750 + }, + { + "epoch": 0.8421568027419059, + "grad_norm": 0.3080023229122162, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0075, + "step": 13760 + }, + { + "epoch": 0.842768835302038, + "grad_norm": 0.2509821951389313, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0053, + "step": 13770 + }, + { + "epoch": 0.8433808678621703, + "grad_norm": 0.17483864724636078, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.005, + "step": 13780 + }, + { + "epoch": 0.8439929004223025, + "grad_norm": 0.3952518403530121, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0056, + "step": 13790 + }, + { + "epoch": 0.8446049329824347, + "grad_norm": 0.2945535480976105, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0055, + "step": 13800 + }, + { + "epoch": 0.8452169655425669, + "grad_norm": 0.13024291396141052, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0073, + "step": 13810 + }, + { + "epoch": 0.8458289981026991, + "grad_norm": 0.1840520054101944, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0061, + "step": 13820 + }, + { + "epoch": 0.8464410306628313, + "grad_norm": 0.2368786782026291, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0058, + "step": 13830 + }, + { + "epoch": 0.8470530632229635, + "grad_norm": 0.2885456085205078, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0055, + "step": 13840 + }, + { + "epoch": 0.8476650957830957, + "grad_norm": 0.2782488167285919, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0046, + "step": 13850 + }, + { + "epoch": 0.8482771283432279, + "grad_norm": 0.1711442470550537, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0055, + "step": 13860 + }, + { + "epoch": 0.8488891609033601, + "grad_norm": 0.22235877811908722, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0056, + "step": 13870 + }, + { + "epoch": 0.8495011934634923, + "grad_norm": 0.1937183290719986, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0047, + "step": 13880 + }, + { + "epoch": 0.8501132260236245, + "grad_norm": 0.33960190415382385, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0063, + "step": 13890 + }, + { + "epoch": 0.8507252585837567, + "grad_norm": 0.1983388215303421, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8513372911438889, + "grad_norm": 0.2968246638774872, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0051, + "step": 13910 + }, + { + "epoch": 0.8519493237040211, + "grad_norm": 0.25328314304351807, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0057, + "step": 13920 + }, + { + "epoch": 0.8525613562641533, + "grad_norm": 0.2435184270143509, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.8531733888242855, + "grad_norm": 0.24512560665607452, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0053, + "step": 13940 + }, + { + "epoch": 0.8537854213844176, + "grad_norm": 0.22028976678848267, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.006, + "step": 13950 + }, + { + "epoch": 0.8543974539445498, + "grad_norm": 0.24743935465812683, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0065, + "step": 13960 + }, + { + "epoch": 0.855009486504682, + "grad_norm": 0.1393810361623764, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0081, + "step": 13970 + }, + { + "epoch": 0.8556215190648142, + "grad_norm": 0.25975972414016724, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0063, + "step": 13980 + }, + { + "epoch": 0.8562335516249464, + "grad_norm": 0.1944616585969925, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0049, + "step": 13990 + }, + { + "epoch": 0.8568455841850786, + "grad_norm": 0.21936742961406708, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.8574576167452108, + "grad_norm": 0.1556629091501236, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 0.858069649305343, + "grad_norm": 0.23696991801261902, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 0.8586816818654752, + "grad_norm": 0.32507795095443726, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0061, + "step": 14030 + }, + { + "epoch": 0.8592937144256074, + "grad_norm": 0.35332199931144714, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0058, + "step": 14040 + }, + { + "epoch": 0.8599057469857396, + "grad_norm": 0.1835644394159317, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0046, + "step": 14050 + }, + { + "epoch": 0.8605177795458718, + "grad_norm": 0.19127517938613892, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0044, + "step": 14060 + }, + { + "epoch": 0.861129812106004, + "grad_norm": 0.30748996138572693, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0055, + "step": 14070 + }, + { + "epoch": 0.8617418446661362, + "grad_norm": 0.178785502910614, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0049, + "step": 14080 + }, + { + "epoch": 0.8623538772262684, + "grad_norm": 0.16979056596755981, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0044, + "step": 14090 + }, + { + "epoch": 0.8629659097864006, + "grad_norm": 0.19519983232021332, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0063, + "step": 14100 + }, + { + "epoch": 0.8635779423465328, + "grad_norm": 0.2722550928592682, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 0.864189974906665, + "grad_norm": 0.1956222504377365, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0054, + "step": 14120 + }, + { + "epoch": 0.8648020074667973, + "grad_norm": 0.32274308800697327, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0066, + "step": 14130 + }, + { + "epoch": 0.8654140400269295, + "grad_norm": 0.25953641533851624, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0056, + "step": 14140 + }, + { + "epoch": 0.8660260725870617, + "grad_norm": 0.3293299674987793, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0057, + "step": 14150 + }, + { + "epoch": 0.8666381051471939, + "grad_norm": 0.35404127836227417, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0072, + "step": 14160 + }, + { + "epoch": 0.8672501377073261, + "grad_norm": 0.24674376845359802, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0064, + "step": 14170 + }, + { + "epoch": 0.8678621702674583, + "grad_norm": 0.23506462574005127, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0047, + "step": 14180 + }, + { + "epoch": 0.8684742028275905, + "grad_norm": 0.30500903725624084, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.8690862353877227, + "grad_norm": 0.23000167310237885, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0051, + "step": 14200 + }, + { + "epoch": 0.8696982679478549, + "grad_norm": 0.17339368164539337, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 0.8703103005079871, + "grad_norm": 0.2505367696285248, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0059, + "step": 14220 + }, + { + "epoch": 0.8709223330681192, + "grad_norm": 0.22645734250545502, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0044, + "step": 14230 + }, + { + "epoch": 0.8715343656282514, + "grad_norm": 0.3509127199649811, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0043, + "step": 14240 + }, + { + "epoch": 0.8721463981883836, + "grad_norm": 0.2758972644805908, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0055, + "step": 14250 + }, + { + "epoch": 0.8727584307485158, + "grad_norm": 0.1943834275007248, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.006, + "step": 14260 + }, + { + "epoch": 0.873370463308648, + "grad_norm": 0.32881075143814087, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 0.8739824958687802, + "grad_norm": 0.35203438997268677, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8745945284289124, + "grad_norm": 0.13618917763233185, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0044, + "step": 14290 + }, + { + "epoch": 0.8752065609890446, + "grad_norm": 0.22939404845237732, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0044, + "step": 14300 + }, + { + "epoch": 0.8758185935491768, + "grad_norm": 0.2027491182088852, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0051, + "step": 14310 + }, + { + "epoch": 0.876430626109309, + "grad_norm": 0.21950028836727142, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0105, + "step": 14320 + }, + { + "epoch": 0.8770426586694412, + "grad_norm": 0.307913213968277, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0051, + "step": 14330 + }, + { + "epoch": 0.8776546912295734, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0058, + "step": 14340 + }, + { + "epoch": 0.8782667237897056, + "grad_norm": 0.3033636808395386, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0066, + "step": 14350 + }, + { + "epoch": 0.8788787563498378, + "grad_norm": 0.25514236092567444, + "learning_rate": 1.210961823379053e-05, + "loss": 0.005, + "step": 14360 + }, + { + "epoch": 0.87949078890997, + "grad_norm": 0.2574418783187866, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0069, + "step": 14370 + }, + { + "epoch": 0.8801028214701022, + "grad_norm": 0.17803016304969788, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.006, + "step": 14380 + }, + { + "epoch": 0.8807148540302344, + "grad_norm": 0.31375741958618164, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0077, + "step": 14390 + }, + { + "epoch": 0.8813268865903666, + "grad_norm": 0.18031778931617737, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0052, + "step": 14400 + }, + { + "epoch": 0.8819389191504988, + "grad_norm": 0.18077519536018372, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0055, + "step": 14410 + }, + { + "epoch": 0.882550951710631, + "grad_norm": 0.22171644866466522, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0059, + "step": 14420 + }, + { + "epoch": 0.8831629842707632, + "grad_norm": 0.16187389194965363, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0044, + "step": 14430 + }, + { + "epoch": 0.8837750168308954, + "grad_norm": 0.27667325735092163, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0063, + "step": 14440 + }, + { + "epoch": 0.8843870493910276, + "grad_norm": 0.2493051290512085, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0072, + "step": 14450 + }, + { + "epoch": 0.8849990819511598, + "grad_norm": 0.3519611656665802, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 0.885611114511292, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0057, + "step": 14470 + }, + { + "epoch": 0.8862231470714242, + "grad_norm": 0.24518658220767975, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.8868351796315564, + "grad_norm": 0.28493785858154297, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0055, + "step": 14490 + }, + { + "epoch": 0.8874472121916887, + "grad_norm": 0.22260263562202454, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 0.8880592447518207, + "grad_norm": 0.2804561257362366, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0051, + "step": 14510 + }, + { + "epoch": 0.888671277311953, + "grad_norm": 0.24349385499954224, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0045, + "step": 14520 + }, + { + "epoch": 0.8892833098720851, + "grad_norm": 0.262207955121994, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0082, + "step": 14530 + }, + { + "epoch": 0.8898953424322174, + "grad_norm": 0.15527820587158203, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0061, + "step": 14540 + }, + { + "epoch": 0.8905073749923496, + "grad_norm": 0.23850804567337036, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0042, + "step": 14550 + }, + { + "epoch": 0.8911194075524818, + "grad_norm": 0.2665582001209259, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0053, + "step": 14560 + }, + { + "epoch": 0.891731440112614, + "grad_norm": 0.2652167081832886, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 0.8923434726727462, + "grad_norm": 0.21386243402957916, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "epoch": 0.8929555052328784, + "grad_norm": 0.3087247312068939, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0082, + "step": 14590 + }, + { + "epoch": 0.8935675377930106, + "grad_norm": 0.2003909796476364, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0048, + "step": 14600 + }, + { + "epoch": 0.8941795703531428, + "grad_norm": 0.2214624583721161, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0062, + "step": 14610 + }, + { + "epoch": 0.894791602913275, + "grad_norm": 0.2500647306442261, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0052, + "step": 14620 + }, + { + "epoch": 0.8954036354734072, + "grad_norm": 0.2615419030189514, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0054, + "step": 14630 + }, + { + "epoch": 0.8960156680335394, + "grad_norm": 0.21347551047801971, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0048, + "step": 14640 + }, + { + "epoch": 0.8966277005936716, + "grad_norm": 0.35483887791633606, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0054, + "step": 14650 + }, + { + "epoch": 0.8972397331538038, + "grad_norm": 0.2423439472913742, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0055, + "step": 14660 + }, + { + "epoch": 0.897851765713936, + "grad_norm": 0.16826359927654266, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 0.8984637982740682, + "grad_norm": 0.3589499294757843, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "epoch": 0.8990758308342004, + "grad_norm": 0.3081042468547821, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 0.8996878633943326, + "grad_norm": 0.31996914744377136, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0063, + "step": 14700 + }, + { + "epoch": 0.9002998959544648, + "grad_norm": 0.301209419965744, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0073, + "step": 14710 + }, + { + "epoch": 0.900911928514597, + "grad_norm": 0.19257168471813202, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0055, + "step": 14720 + }, + { + "epoch": 0.9015239610747292, + "grad_norm": 0.15221600234508514, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0053, + "step": 14730 + }, + { + "epoch": 0.9021359936348614, + "grad_norm": 0.21519577503204346, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0055, + "step": 14740 + }, + { + "epoch": 0.9027480261949936, + "grad_norm": 0.23772196471691132, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.006, + "step": 14750 + }, + { + "epoch": 0.9033600587551258, + "grad_norm": 0.2872219979763031, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.903972091315258, + "grad_norm": 0.2589483857154846, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0056, + "step": 14770 + }, + { + "epoch": 0.9045841238753902, + "grad_norm": 0.31850162148475647, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0051, + "step": 14780 + }, + { + "epoch": 0.9051961564355223, + "grad_norm": 0.27179282903671265, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0051, + "step": 14790 + }, + { + "epoch": 0.9058081889956545, + "grad_norm": 0.4132739007472992, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.005, + "step": 14800 + }, + { + "epoch": 0.9064202215557867, + "grad_norm": 0.19336774945259094, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0041, + "step": 14810 + }, + { + "epoch": 0.9070322541159189, + "grad_norm": 0.20783282816410065, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.9076442866760511, + "grad_norm": 0.26141899824142456, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0069, + "step": 14830 + }, + { + "epoch": 0.9082563192361833, + "grad_norm": 0.2158539742231369, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0081, + "step": 14840 + }, + { + "epoch": 0.9088683517963155, + "grad_norm": 0.3233732581138611, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0065, + "step": 14850 + }, + { + "epoch": 0.9094803843564477, + "grad_norm": 0.23924769461154938, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0042, + "step": 14860 + }, + { + "epoch": 0.9100924169165799, + "grad_norm": 0.17663812637329102, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.004, + "step": 14870 + }, + { + "epoch": 0.9107044494767121, + "grad_norm": 0.34379643201828003, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.005, + "step": 14880 + }, + { + "epoch": 0.9113164820368443, + "grad_norm": 0.29971349239349365, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0059, + "step": 14890 + }, + { + "epoch": 0.9119285145969765, + "grad_norm": 0.24832949042320251, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0042, + "step": 14900 + }, + { + "epoch": 0.9125405471571088, + "grad_norm": 0.22288024425506592, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0065, + "step": 14910 + }, + { + "epoch": 0.913152579717241, + "grad_norm": 0.2806689441204071, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0043, + "step": 14920 + }, + { + "epoch": 0.9137646122773732, + "grad_norm": 0.3908274173736572, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0047, + "step": 14930 + }, + { + "epoch": 0.9143766448375054, + "grad_norm": 0.16255778074264526, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0068, + "step": 14940 + }, + { + "epoch": 0.9149886773976376, + "grad_norm": 0.430791437625885, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0085, + "step": 14950 + }, + { + "epoch": 0.9156007099577698, + "grad_norm": 0.1739969551563263, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0056, + "step": 14960 + }, + { + "epoch": 0.916212742517902, + "grad_norm": 0.24298283457756042, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0055, + "step": 14970 + }, + { + "epoch": 0.9168247750780342, + "grad_norm": 0.21269915997982025, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0051, + "step": 14980 + }, + { + "epoch": 0.9174368076381664, + "grad_norm": 0.263388991355896, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0058, + "step": 14990 + }, + { + "epoch": 0.9180488401982986, + "grad_norm": 0.28030532598495483, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 0.9186608727584308, + "grad_norm": 0.17051894962787628, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 0.919272905318563, + "grad_norm": 0.2763383388519287, + "learning_rate": 1.146875176249365e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 0.9198849378786952, + "grad_norm": 0.2616822421550751, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "epoch": 0.9204969704388274, + "grad_norm": 0.21407093107700348, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0062, + "step": 15040 + }, + { + "epoch": 0.9211090029989596, + "grad_norm": 0.23936578631401062, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0073, + "step": 15050 + }, + { + "epoch": 0.9217210355590918, + "grad_norm": 0.26383110880851746, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.006, + "step": 15060 + }, + { + "epoch": 0.922333068119224, + "grad_norm": 0.19477945566177368, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0043, + "step": 15070 + }, + { + "epoch": 0.9229451006793561, + "grad_norm": 0.16677282750606537, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0061, + "step": 15080 + }, + { + "epoch": 0.9235571332394883, + "grad_norm": 0.26856037974357605, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0065, + "step": 15090 + }, + { + "epoch": 0.9241691657996205, + "grad_norm": 0.20086173713207245, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0056, + "step": 15100 + }, + { + "epoch": 0.9247811983597527, + "grad_norm": 0.26998719573020935, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0034, + "step": 15110 + }, + { + "epoch": 0.9253932309198849, + "grad_norm": 0.12727728486061096, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0043, + "step": 15120 + }, + { + "epoch": 0.9260052634800171, + "grad_norm": 0.11288347095251083, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0055, + "step": 15130 + }, + { + "epoch": 0.9266172960401493, + "grad_norm": 0.1109771579504013, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0048, + "step": 15140 + }, + { + "epoch": 0.9272293286002815, + "grad_norm": 0.2556479275226593, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0056, + "step": 15150 + }, + { + "epoch": 0.9278413611604137, + "grad_norm": 0.2149561196565628, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.005, + "step": 15160 + }, + { + "epoch": 0.9284533937205459, + "grad_norm": 0.16953054070472717, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 0.9290654262806781, + "grad_norm": 0.18306049704551697, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.004, + "step": 15180 + }, + { + "epoch": 0.9296774588408103, + "grad_norm": 0.15755385160446167, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0035, + "step": 15190 + }, + { + "epoch": 0.9302894914009425, + "grad_norm": 0.21062517166137695, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 0.9309015239610747, + "grad_norm": 0.1403888463973999, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0051, + "step": 15210 + }, + { + "epoch": 0.9315135565212069, + "grad_norm": 0.4044550359249115, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0062, + "step": 15220 + }, + { + "epoch": 0.9321255890813391, + "grad_norm": 0.22543896734714508, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 0.9327376216414713, + "grad_norm": 0.2025403380393982, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0055, + "step": 15240 + }, + { + "epoch": 0.9333496542016035, + "grad_norm": 1.0549683570861816, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0092, + "step": 15250 + }, + { + "epoch": 0.9339616867617357, + "grad_norm": 0.3442397117614746, + "learning_rate": 1.123494277220359e-05, + "loss": 0.005, + "step": 15260 + }, + { + "epoch": 0.934573719321868, + "grad_norm": 0.1678813248872757, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.005, + "step": 15270 + }, + { + "epoch": 0.9351857518820001, + "grad_norm": 0.31081119179725647, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 0.9357977844421324, + "grad_norm": 0.25498780608177185, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.006, + "step": 15290 + }, + { + "epoch": 0.9364098170022646, + "grad_norm": 0.21825125813484192, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0054, + "step": 15300 + }, + { + "epoch": 0.9370218495623968, + "grad_norm": 0.19719983637332916, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0074, + "step": 15310 + }, + { + "epoch": 0.937633882122529, + "grad_norm": 0.32297465205192566, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0058, + "step": 15320 + }, + { + "epoch": 0.9382459146826612, + "grad_norm": 0.2717733383178711, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0035, + "step": 15330 + }, + { + "epoch": 0.9388579472427934, + "grad_norm": 0.22138433158397675, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0048, + "step": 15340 + }, + { + "epoch": 0.9394699798029256, + "grad_norm": 0.1943465769290924, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 0.9400820123630577, + "grad_norm": 0.18422184884548187, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0054, + "step": 15360 + }, + { + "epoch": 0.9406940449231899, + "grad_norm": 0.17614246904850006, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0054, + "step": 15370 + }, + { + "epoch": 0.9413060774833221, + "grad_norm": 0.17661592364311218, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9419181100434543, + "grad_norm": 0.42976850271224976, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0054, + "step": 15390 + }, + { + "epoch": 0.9425301426035865, + "grad_norm": 0.34272316098213196, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0044, + "step": 15400 + }, + { + "epoch": 0.9431421751637187, + "grad_norm": 0.3346613645553589, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0042, + "step": 15410 + }, + { + "epoch": 0.9437542077238509, + "grad_norm": 0.15300114452838898, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0057, + "step": 15420 + }, + { + "epoch": 0.9443662402839831, + "grad_norm": 0.23935656249523163, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0084, + "step": 15430 + }, + { + "epoch": 0.9449782728441153, + "grad_norm": 0.21595227718353271, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0051, + "step": 15440 + }, + { + "epoch": 0.9455903054042475, + "grad_norm": 0.2670149505138397, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0059, + "step": 15450 + }, + { + "epoch": 0.9462023379643797, + "grad_norm": 0.2214009314775467, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0054, + "step": 15460 + }, + { + "epoch": 0.9468143705245119, + "grad_norm": 0.3491996228694916, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 0.9474264030846441, + "grad_norm": 0.28213024139404297, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0054, + "step": 15480 + }, + { + "epoch": 0.9480384356447763, + "grad_norm": 0.30218765139579773, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0049, + "step": 15490 + }, + { + "epoch": 0.9486504682049085, + "grad_norm": 0.17068025469779968, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0046, + "step": 15500 + }, + { + "epoch": 0.9492625007650407, + "grad_norm": 0.23325121402740479, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0054, + "step": 15510 + }, + { + "epoch": 0.9498745333251729, + "grad_norm": 0.22118528187274933, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0035, + "step": 15520 + }, + { + "epoch": 0.9504865658853051, + "grad_norm": 0.20202121138572693, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9510985984454373, + "grad_norm": 0.28455010056495667, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0039, + "step": 15540 + }, + { + "epoch": 0.9517106310055695, + "grad_norm": 0.26871445775032043, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0046, + "step": 15550 + }, + { + "epoch": 0.9523226635657017, + "grad_norm": 0.33665943145751953, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0058, + "step": 15560 + }, + { + "epoch": 0.9529346961258339, + "grad_norm": 0.3182595670223236, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0053, + "step": 15570 + }, + { + "epoch": 0.9535467286859661, + "grad_norm": 0.2867930829524994, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0068, + "step": 15580 + }, + { + "epoch": 0.9541587612460983, + "grad_norm": 0.21562239527702332, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.0051, + "step": 15590 + }, + { + "epoch": 0.9547707938062305, + "grad_norm": 0.19122859835624695, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0046, + "step": 15600 + }, + { + "epoch": 0.9553828263663627, + "grad_norm": 0.24596959352493286, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.005, + "step": 15610 + }, + { + "epoch": 0.9559948589264949, + "grad_norm": 0.182195246219635, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0038, + "step": 15620 + }, + { + "epoch": 0.9566068914866271, + "grad_norm": 0.3122585415840149, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0053, + "step": 15630 + }, + { + "epoch": 0.9572189240467592, + "grad_norm": 0.25725093483924866, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0063, + "step": 15640 + }, + { + "epoch": 0.9578309566068914, + "grad_norm": 0.19965514540672302, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0053, + "step": 15650 + }, + { + "epoch": 0.9584429891670236, + "grad_norm": 0.3474758267402649, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.006, + "step": 15660 + }, + { + "epoch": 0.9590550217271558, + "grad_norm": 0.18151336908340454, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0048, + "step": 15670 + }, + { + "epoch": 0.959667054287288, + "grad_norm": 0.18923020362854004, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0037, + "step": 15680 + }, + { + "epoch": 0.9602790868474202, + "grad_norm": 0.19792871177196503, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0049, + "step": 15690 + }, + { + "epoch": 0.9608911194075525, + "grad_norm": 0.20296797156333923, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0047, + "step": 15700 + }, + { + "epoch": 0.9615031519676847, + "grad_norm": 0.2556051015853882, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.9621151845278169, + "grad_norm": 0.35538288950920105, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0037, + "step": 15720 + }, + { + "epoch": 0.9627272170879491, + "grad_norm": 0.45357266068458557, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0065, + "step": 15730 + }, + { + "epoch": 0.9633392496480813, + "grad_norm": 0.23721693456172943, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0046, + "step": 15740 + }, + { + "epoch": 0.9639512822082135, + "grad_norm": 0.2727845013141632, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0052, + "step": 15750 + }, + { + "epoch": 0.9645633147683457, + "grad_norm": 0.2647950351238251, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0054, + "step": 15760 + }, + { + "epoch": 0.9651753473284779, + "grad_norm": 0.23364882171154022, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.005, + "step": 15770 + }, + { + "epoch": 0.9657873798886101, + "grad_norm": 0.2035825401544571, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0054, + "step": 15780 + }, + { + "epoch": 0.9663994124487423, + "grad_norm": 0.2411692589521408, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0062, + "step": 15790 + }, + { + "epoch": 0.9670114450088745, + "grad_norm": 0.23559266328811646, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 0.9676234775690067, + "grad_norm": 0.23872418701648712, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0063, + "step": 15810 + }, + { + "epoch": 0.9682355101291389, + "grad_norm": 0.27072128653526306, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0052, + "step": 15820 + }, + { + "epoch": 0.9688475426892711, + "grad_norm": 0.42610588669776917, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0056, + "step": 15830 + }, + { + "epoch": 0.9694595752494033, + "grad_norm": 0.13065233826637268, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0044, + "step": 15840 + }, + { + "epoch": 0.9700716078095355, + "grad_norm": 0.2479996383190155, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0049, + "step": 15850 + }, + { + "epoch": 0.9706836403696677, + "grad_norm": 0.22867974638938904, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 0.9712956729297999, + "grad_norm": 0.21570387482643127, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0048, + "step": 15870 + }, + { + "epoch": 0.9719077054899321, + "grad_norm": 0.26354169845581055, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0073, + "step": 15880 + }, + { + "epoch": 0.9725197380500643, + "grad_norm": 0.19785451889038086, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0044, + "step": 15890 + }, + { + "epoch": 0.9731317706101965, + "grad_norm": 0.09346124529838562, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0051, + "step": 15900 + }, + { + "epoch": 0.9737438031703287, + "grad_norm": 0.18946298956871033, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0049, + "step": 15910 + }, + { + "epoch": 0.9743558357304608, + "grad_norm": 0.1761726588010788, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0057, + "step": 15920 + }, + { + "epoch": 0.974967868290593, + "grad_norm": 0.2610328495502472, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0061, + "step": 15930 + }, + { + "epoch": 0.9755799008507252, + "grad_norm": 0.1841743141412735, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 0.9761919334108574, + "grad_norm": 0.14279355108737946, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 0.9768039659709896, + "grad_norm": 0.1717681884765625, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0035, + "step": 15960 + }, + { + "epoch": 0.9774159985311218, + "grad_norm": 0.2102527618408203, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.007, + "step": 15970 + }, + { + "epoch": 0.978028031091254, + "grad_norm": 0.29462379217147827, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0058, + "step": 15980 + }, + { + "epoch": 0.9786400636513862, + "grad_norm": 0.1863207072019577, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0058, + "step": 15990 + }, + { + "epoch": 0.9792520962115184, + "grad_norm": 0.2764773964881897, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0051, + "step": 16000 + }, + { + "epoch": 0.9798641287716506, + "grad_norm": 0.2723250091075897, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0056, + "step": 16010 + }, + { + "epoch": 0.9804761613317828, + "grad_norm": 0.21564331650733948, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0048, + "step": 16020 + }, + { + "epoch": 0.981088193891915, + "grad_norm": 0.20242232084274292, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0058, + "step": 16030 + }, + { + "epoch": 0.9817002264520472, + "grad_norm": 0.21522754430770874, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0039, + "step": 16040 + }, + { + "epoch": 0.9823122590121794, + "grad_norm": 0.20013833045959473, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0051, + "step": 16050 + }, + { + "epoch": 0.9829242915723116, + "grad_norm": 0.3008810579776764, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 0.9835363241324439, + "grad_norm": 0.2994979918003082, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0033, + "step": 16070 + }, + { + "epoch": 0.984148356692576, + "grad_norm": 0.22704628109931946, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0046, + "step": 16080 + }, + { + "epoch": 0.9847603892527083, + "grad_norm": 0.3253551423549652, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9853724218128405, + "grad_norm": 0.14902091026306152, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0042, + "step": 16100 + }, + { + "epoch": 0.9859844543729727, + "grad_norm": 0.15155524015426636, + "learning_rate": 1.04066696184376e-05, + "loss": 0.005, + "step": 16110 + }, + { + "epoch": 0.9865964869331049, + "grad_norm": 0.1859518140554428, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0063, + "step": 16120 + }, + { + "epoch": 0.9872085194932371, + "grad_norm": 0.5434902906417847, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0072, + "step": 16130 + }, + { + "epoch": 0.9878205520533693, + "grad_norm": 0.19308103621006012, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0046, + "step": 16140 + }, + { + "epoch": 0.9884325846135015, + "grad_norm": 0.21260593831539154, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0077, + "step": 16150 + }, + { + "epoch": 0.9890446171736337, + "grad_norm": 0.15255668759346008, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0059, + "step": 16160 + }, + { + "epoch": 0.9896566497337659, + "grad_norm": 0.18739885091781616, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0047, + "step": 16170 + }, + { + "epoch": 0.9902686822938981, + "grad_norm": 0.2112029641866684, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0049, + "step": 16180 + }, + { + "epoch": 0.9908807148540303, + "grad_norm": 0.35941991209983826, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.005, + "step": 16190 + }, + { + "epoch": 0.9914927474141624, + "grad_norm": 0.16792108118534088, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0051, + "step": 16200 + }, + { + "epoch": 0.9921047799742946, + "grad_norm": 0.1985466182231903, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0063, + "step": 16210 + }, + { + "epoch": 0.9927168125344268, + "grad_norm": 0.17579570412635803, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0046, + "step": 16220 + }, + { + "epoch": 0.993328845094559, + "grad_norm": 0.23352178931236267, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0061, + "step": 16230 + }, + { + "epoch": 0.9939408776546912, + "grad_norm": 0.3543553054332733, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0054, + "step": 16240 + }, + { + "epoch": 0.9945529102148234, + "grad_norm": 0.18603719770908356, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0049, + "step": 16250 + }, + { + "epoch": 0.9951649427749556, + "grad_norm": 0.31745344400405884, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0061, + "step": 16260 + }, + { + "epoch": 0.9957769753350878, + "grad_norm": 0.1416773498058319, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0063, + "step": 16270 + }, + { + "epoch": 0.99638900789522, + "grad_norm": 0.18451642990112305, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0055, + "step": 16280 + }, + { + "epoch": 0.9970010404553522, + "grad_norm": 0.13422183692455292, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0047, + "step": 16290 + }, + { + "epoch": 0.9976130730154844, + "grad_norm": 0.15831588208675385, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0045, + "step": 16300 + }, + { + "epoch": 0.9982251055756166, + "grad_norm": 0.42520084977149963, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0053, + "step": 16310 + }, + { + "epoch": 0.9988371381357488, + "grad_norm": 0.20889437198638916, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0043, + "step": 16320 + }, + { + "epoch": 0.999449170695881, + "grad_norm": 0.17016667127609253, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0072, + "step": 16330 + }, + { + "epoch": 1.0000612032560132, + "grad_norm": 0.3129214346408844, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0054, + "step": 16340 + }, + { + "epoch": 1.0006732358161454, + "grad_norm": 0.334224134683609, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0037, + "step": 16350 + }, + { + "epoch": 1.0012852683762776, + "grad_norm": 0.28502705693244934, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0065, + "step": 16360 + }, + { + "epoch": 1.0018973009364098, + "grad_norm": 0.21431966125965118, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0046, + "step": 16370 + }, + { + "epoch": 1.002509333496542, + "grad_norm": 0.22898051142692566, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 1.0031213660566742, + "grad_norm": 0.41625624895095825, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0068, + "step": 16390 + }, + { + "epoch": 1.0037333986168064, + "grad_norm": 0.2510327398777008, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 1.0043454311769386, + "grad_norm": 0.23560962080955505, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0049, + "step": 16410 + }, + { + "epoch": 1.0049574637370708, + "grad_norm": 0.2081199437379837, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0061, + "step": 16420 + }, + { + "epoch": 1.005569496297203, + "grad_norm": 0.12456244230270386, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0057, + "step": 16430 + }, + { + "epoch": 1.0061815288573353, + "grad_norm": 0.22212636470794678, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0052, + "step": 16440 + }, + { + "epoch": 1.0067935614174675, + "grad_norm": 0.27772897481918335, + "learning_rate": 1.007637577910799e-05, + "loss": 0.007, + "step": 16450 + }, + { + "epoch": 1.0074055939775997, + "grad_norm": 0.40040507912635803, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0051, + "step": 16460 + }, + { + "epoch": 1.0080176265377319, + "grad_norm": 0.19763565063476562, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0047, + "step": 16470 + }, + { + "epoch": 1.008629659097864, + "grad_norm": 0.2906181514263153, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0055, + "step": 16480 + }, + { + "epoch": 1.0092416916579963, + "grad_norm": 0.29949888586997986, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0045, + "step": 16490 + }, + { + "epoch": 1.0098537242181285, + "grad_norm": 0.3900962769985199, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0053, + "step": 16500 + }, + { + "epoch": 1.0104657567782607, + "grad_norm": 0.22380846738815308, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0043, + "step": 16510 + }, + { + "epoch": 1.0110777893383929, + "grad_norm": 0.3426673412322998, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0052, + "step": 16520 + }, + { + "epoch": 1.011689821898525, + "grad_norm": 0.2452230006456375, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0055, + "step": 16530 + }, + { + "epoch": 1.0123018544586573, + "grad_norm": 0.24280408024787903, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0042, + "step": 16540 + }, + { + "epoch": 1.0129138870187895, + "grad_norm": 0.18271701037883759, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0047, + "step": 16550 + }, + { + "epoch": 1.0135259195789217, + "grad_norm": 0.2874322235584259, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0059, + "step": 16560 + }, + { + "epoch": 1.0141379521390539, + "grad_norm": 0.17367394268512726, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0048, + "step": 16570 + }, + { + "epoch": 1.014749984699186, + "grad_norm": 0.167460098862648, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0074, + "step": 16580 + }, + { + "epoch": 1.0153620172593183, + "grad_norm": 0.21867765486240387, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 1.0159740498194505, + "grad_norm": 0.2539086639881134, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0057, + "step": 16600 + }, + { + "epoch": 1.0165860823795827, + "grad_norm": 0.1415795534849167, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 1.0171981149397147, + "grad_norm": 0.12702493369579315, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0038, + "step": 16620 + }, + { + "epoch": 1.0178101474998469, + "grad_norm": 0.16548305749893188, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0042, + "step": 16630 + }, + { + "epoch": 1.018422180059979, + "grad_norm": 0.4413173496723175, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0059, + "step": 16640 + }, + { + "epoch": 1.0190342126201113, + "grad_norm": 0.30871614813804626, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0045, + "step": 16650 + }, + { + "epoch": 1.0196462451802435, + "grad_norm": 0.259650319814682, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0044, + "step": 16660 + }, + { + "epoch": 1.0202582777403757, + "grad_norm": 0.36035388708114624, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0068, + "step": 16670 + }, + { + "epoch": 1.020870310300508, + "grad_norm": 0.3487808406352997, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0038, + "step": 16680 + }, + { + "epoch": 1.02148234286064, + "grad_norm": 0.2898370623588562, + "learning_rate": 9.843955128197274e-06, + "loss": 0.004, + "step": 16690 + }, + { + "epoch": 1.0220943754207723, + "grad_norm": 0.2942182719707489, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0042, + "step": 16700 + }, + { + "epoch": 1.0227064079809045, + "grad_norm": 0.27839869260787964, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0042, + "step": 16710 + }, + { + "epoch": 1.0233184405410367, + "grad_norm": 0.17199957370758057, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0059, + "step": 16720 + }, + { + "epoch": 1.023930473101169, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0073, + "step": 16730 + }, + { + "epoch": 1.0245425056613011, + "grad_norm": 0.19908513128757477, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0047, + "step": 16740 + }, + { + "epoch": 1.0251545382214333, + "grad_norm": 0.23300328850746155, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0055, + "step": 16750 + }, + { + "epoch": 1.0257665707815655, + "grad_norm": 0.24671277403831482, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0043, + "step": 16760 + }, + { + "epoch": 1.0263786033416977, + "grad_norm": 0.23183101415634155, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0052, + "step": 16770 + }, + { + "epoch": 1.02699063590183, + "grad_norm": 0.13460612297058105, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0035, + "step": 16780 + }, + { + "epoch": 1.0276026684619621, + "grad_norm": 0.1990940123796463, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0044, + "step": 16790 + }, + { + "epoch": 1.0282147010220943, + "grad_norm": 0.21223406493663788, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 1.0288267335822265, + "grad_norm": 0.2649106979370117, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0044, + "step": 16810 + }, + { + "epoch": 1.0294387661423587, + "grad_norm": 0.2524845600128174, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0048, + "step": 16820 + }, + { + "epoch": 1.030050798702491, + "grad_norm": 0.22169779241085052, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 1.0306628312626231, + "grad_norm": 0.16642418503761292, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0048, + "step": 16840 + }, + { + "epoch": 1.0312748638227553, + "grad_norm": 0.22939598560333252, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0048, + "step": 16850 + }, + { + "epoch": 1.0318868963828876, + "grad_norm": 0.2131129503250122, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0055, + "step": 16860 + }, + { + "epoch": 1.0324989289430198, + "grad_norm": 0.20492705702781677, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0041, + "step": 16870 + }, + { + "epoch": 1.033110961503152, + "grad_norm": 0.2988845705986023, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0042, + "step": 16880 + }, + { + "epoch": 1.0337229940632842, + "grad_norm": 0.18579600751399994, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0042, + "step": 16890 + }, + { + "epoch": 1.0343350266234164, + "grad_norm": 0.2553490698337555, + "learning_rate": 9.641222698101725e-06, + "loss": 0.005, + "step": 16900 + }, + { + "epoch": 1.0349470591835486, + "grad_norm": 0.338440865278244, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0036, + "step": 16910 + }, + { + "epoch": 1.0355590917436808, + "grad_norm": 0.12755723297595978, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0044, + "step": 16920 + }, + { + "epoch": 1.036171124303813, + "grad_norm": 0.12222232669591904, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 1.0367831568639452, + "grad_norm": 0.20246204733848572, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0055, + "step": 16940 + }, + { + "epoch": 1.0373951894240774, + "grad_norm": 0.36903291940689087, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0051, + "step": 16950 + }, + { + "epoch": 1.0380072219842096, + "grad_norm": 0.3166116178035736, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0045, + "step": 16960 + }, + { + "epoch": 1.0386192545443418, + "grad_norm": 0.2777375280857086, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0041, + "step": 16970 + }, + { + "epoch": 1.039231287104474, + "grad_norm": 0.3173989951610565, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0053, + "step": 16980 + }, + { + "epoch": 1.0398433196646062, + "grad_norm": 0.2135571539402008, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0051, + "step": 16990 + }, + { + "epoch": 1.0404553522247384, + "grad_norm": 0.18536782264709473, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0037, + "step": 17000 + }, + { + "epoch": 1.0410673847848706, + "grad_norm": 0.17782410979270935, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0052, + "step": 17010 + }, + { + "epoch": 1.0416794173450028, + "grad_norm": 0.31509512662887573, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0099, + "step": 17020 + }, + { + "epoch": 1.042291449905135, + "grad_norm": 0.22748225927352905, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 1.0429034824652672, + "grad_norm": 0.14924705028533936, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 1.0435155150253994, + "grad_norm": 0.21390999853610992, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0044, + "step": 17050 + }, + { + "epoch": 1.0441275475855316, + "grad_norm": 0.25828516483306885, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0042, + "step": 17060 + }, + { + "epoch": 1.0447395801456638, + "grad_norm": 0.24069662392139435, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0069, + "step": 17070 + }, + { + "epoch": 1.045351612705796, + "grad_norm": 0.1090504601597786, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0036, + "step": 17080 + }, + { + "epoch": 1.0459636452659282, + "grad_norm": 0.17990687489509583, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0049, + "step": 17090 + }, + { + "epoch": 1.0465756778260604, + "grad_norm": 0.21505555510520935, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0051, + "step": 17100 + }, + { + "epoch": 1.0471877103861926, + "grad_norm": 0.2157493680715561, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0063, + "step": 17110 + }, + { + "epoch": 1.0477997429463248, + "grad_norm": 0.30865493416786194, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0053, + "step": 17120 + }, + { + "epoch": 1.048411775506457, + "grad_norm": 0.16882938146591187, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0043, + "step": 17130 + }, + { + "epoch": 1.0490238080665892, + "grad_norm": 0.14921846985816956, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0496358406267214, + "grad_norm": 0.15723800659179688, + "learning_rate": 9.400800085133245e-06, + "loss": 0.005, + "step": 17150 + }, + { + "epoch": 1.0502478731868536, + "grad_norm": 0.19597285985946655, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0046, + "step": 17160 + }, + { + "epoch": 1.0508599057469858, + "grad_norm": 0.1684723198413849, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0053, + "step": 17170 + }, + { + "epoch": 1.051471938307118, + "grad_norm": 0.1733175367116928, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0053, + "step": 17180 + }, + { + "epoch": 1.0520839708672503, + "grad_norm": 0.23111647367477417, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0048, + "step": 17190 + }, + { + "epoch": 1.0526960034273822, + "grad_norm": 0.36174628138542175, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0049, + "step": 17200 + }, + { + "epoch": 1.0533080359875144, + "grad_norm": 0.15791575610637665, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0048, + "step": 17210 + }, + { + "epoch": 1.0539200685476466, + "grad_norm": 0.16026809811592102, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0047, + "step": 17220 + }, + { + "epoch": 1.0545321011077788, + "grad_norm": 0.13964296877384186, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0033, + "step": 17230 + }, + { + "epoch": 1.055144133667911, + "grad_norm": 0.22623896598815918, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0041, + "step": 17240 + }, + { + "epoch": 1.0557561662280432, + "grad_norm": 0.15534555912017822, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 1.0563681987881754, + "grad_norm": 0.09519665688276291, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0035, + "step": 17260 + }, + { + "epoch": 1.0569802313483077, + "grad_norm": 0.19323785603046417, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0045, + "step": 17270 + }, + { + "epoch": 1.0575922639084399, + "grad_norm": 0.21194952726364136, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0047, + "step": 17280 + }, + { + "epoch": 1.058204296468572, + "grad_norm": 0.28977999091148376, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0049, + "step": 17290 + }, + { + "epoch": 1.0588163290287043, + "grad_norm": 0.1739121824502945, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0062, + "step": 17300 + }, + { + "epoch": 1.0594283615888365, + "grad_norm": 0.23189865052700043, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0055, + "step": 17310 + }, + { + "epoch": 1.0600403941489687, + "grad_norm": 0.15705449879169464, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 1.0606524267091009, + "grad_norm": 0.23189882934093475, + "learning_rate": 9.228411903689187e-06, + "loss": 0.003, + "step": 17330 + }, + { + "epoch": 1.061264459269233, + "grad_norm": 0.19559095799922943, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0051, + "step": 17340 + }, + { + "epoch": 1.0618764918293653, + "grad_norm": 0.2560543715953827, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0624885243894975, + "grad_norm": 0.35167232155799866, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0042, + "step": 17360 + }, + { + "epoch": 1.0631005569496297, + "grad_norm": 0.17626497149467468, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0039, + "step": 17370 + }, + { + "epoch": 1.0637125895097619, + "grad_norm": 0.18818546831607819, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0043, + "step": 17380 + }, + { + "epoch": 1.064324622069894, + "grad_norm": 0.10237561911344528, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0053, + "step": 17390 + }, + { + "epoch": 1.0649366546300263, + "grad_norm": 0.21828459203243256, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0042, + "step": 17400 + }, + { + "epoch": 1.0655486871901585, + "grad_norm": 0.09354235231876373, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 1.0661607197502907, + "grad_norm": 0.18106088042259216, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0051, + "step": 17420 + }, + { + "epoch": 1.066772752310423, + "grad_norm": 0.21538101136684418, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0056, + "step": 17430 + }, + { + "epoch": 1.067384784870555, + "grad_norm": 0.18729519844055176, + "learning_rate": 9.1233909973763e-06, + "loss": 0.004, + "step": 17440 + }, + { + "epoch": 1.0679968174306873, + "grad_norm": 0.3791484832763672, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0052, + "step": 17450 + }, + { + "epoch": 1.0686088499908195, + "grad_norm": 0.19206254184246063, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0042, + "step": 17460 + }, + { + "epoch": 1.0692208825509517, + "grad_norm": 0.15434518456459045, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0061, + "step": 17470 + }, + { + "epoch": 1.069832915111084, + "grad_norm": 0.17898093163967133, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 1.0704449476712161, + "grad_norm": 0.21975649893283844, + "learning_rate": 9.07574141798717e-06, + "loss": 0.005, + "step": 17490 + }, + { + "epoch": 1.0710569802313483, + "grad_norm": 0.1380346417427063, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0032, + "step": 17500 + }, + { + "epoch": 1.0716690127914805, + "grad_norm": 0.28567400574684143, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0044, + "step": 17510 + }, + { + "epoch": 1.0722810453516127, + "grad_norm": 0.22925534844398499, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 1.072893077911745, + "grad_norm": 0.27094215154647827, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0047, + "step": 17530 + }, + { + "epoch": 1.0735051104718771, + "grad_norm": 0.32299691438674927, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0048, + "step": 17540 + }, + { + "epoch": 1.0741171430320093, + "grad_norm": 0.26789531111717224, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 1.0747291755921415, + "grad_norm": 0.3175952434539795, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0077, + "step": 17560 + }, + { + "epoch": 1.0753412081522737, + "grad_norm": 0.24784249067306519, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 1.075953240712406, + "grad_norm": 0.3081960380077362, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0046, + "step": 17580 + }, + { + "epoch": 1.0765652732725381, + "grad_norm": 0.25334152579307556, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0039, + "step": 17590 + }, + { + "epoch": 1.0771773058326704, + "grad_norm": 0.24747619032859802, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0777893383928026, + "grad_norm": 0.19048908352851868, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0049, + "step": 17610 + }, + { + "epoch": 1.0784013709529348, + "grad_norm": 0.18883349001407623, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "epoch": 1.079013403513067, + "grad_norm": 0.18653099238872528, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0044, + "step": 17630 + }, + { + "epoch": 1.0796254360731992, + "grad_norm": 0.1320251226425171, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 1.0802374686333314, + "grad_norm": 0.14996238052845, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0041, + "step": 17650 + }, + { + "epoch": 1.0808495011934636, + "grad_norm": 0.4576573073863983, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0059, + "step": 17660 + }, + { + "epoch": 1.0814615337535958, + "grad_norm": 0.19582511484622955, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0051, + "step": 17670 + }, + { + "epoch": 1.082073566313728, + "grad_norm": 0.21973003447055817, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0059, + "step": 17680 + }, + { + "epoch": 1.0826855988738602, + "grad_norm": 0.18183568120002747, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0056, + "step": 17690 + }, + { + "epoch": 1.0832976314339924, + "grad_norm": 0.1761978417634964, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0049, + "step": 17700 + }, + { + "epoch": 1.0839096639941246, + "grad_norm": 0.10185366123914719, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0041, + "step": 17710 + }, + { + "epoch": 1.0845216965542568, + "grad_norm": 0.262513130903244, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0046, + "step": 17720 + }, + { + "epoch": 1.0851337291143888, + "grad_norm": 0.36413198709487915, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0043, + "step": 17730 + }, + { + "epoch": 1.085745761674521, + "grad_norm": 0.2258218675851822, + "learning_rate": 8.83836825410936e-06, + "loss": 0.005, + "step": 17740 + }, + { + "epoch": 1.0863577942346532, + "grad_norm": 0.20840497314929962, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0067, + "step": 17750 + }, + { + "epoch": 1.0869698267947854, + "grad_norm": 0.33392995595932007, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.0875818593549176, + "grad_norm": 0.18477876484394073, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0039, + "step": 17770 + }, + { + "epoch": 1.0881938919150498, + "grad_norm": 0.14785899221897125, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0063, + "step": 17780 + }, + { + "epoch": 1.088805924475182, + "grad_norm": 0.12930043041706085, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0055, + "step": 17790 + }, + { + "epoch": 1.0894179570353142, + "grad_norm": 0.1541786789894104, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0035, + "step": 17800 + }, + { + "epoch": 1.0900299895954464, + "grad_norm": 0.1781499683856964, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0054, + "step": 17810 + }, + { + "epoch": 1.0906420221555786, + "grad_norm": 0.13659314811229706, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0912540547157108, + "grad_norm": 0.18936918675899506, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0041, + "step": 17830 + }, + { + "epoch": 1.091866087275843, + "grad_norm": 0.24795638024806976, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0043, + "step": 17840 + }, + { + "epoch": 1.0924781198359752, + "grad_norm": 0.28090324997901917, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 1.0930901523961074, + "grad_norm": 0.3130576014518738, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0042, + "step": 17860 + }, + { + "epoch": 1.0937021849562396, + "grad_norm": 0.19758646190166473, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0047, + "step": 17870 + }, + { + "epoch": 1.0943142175163718, + "grad_norm": 0.20309071242809296, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0042, + "step": 17880 + }, + { + "epoch": 1.094926250076504, + "grad_norm": 0.19741898775100708, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0057, + "step": 17890 + }, + { + "epoch": 1.0955382826366362, + "grad_norm": 0.19182747602462769, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 1.0961503151967684, + "grad_norm": 0.14508575201034546, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.0967623477569006, + "grad_norm": 0.19854849576950073, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0064, + "step": 17920 + }, + { + "epoch": 1.0973743803170328, + "grad_norm": 0.15055720508098602, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0049, + "step": 17930 + }, + { + "epoch": 1.097986412877165, + "grad_norm": 0.1855372190475464, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0043, + "step": 17940 + }, + { + "epoch": 1.0985984454372972, + "grad_norm": 0.13770940899848938, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0058, + "step": 17950 + }, + { + "epoch": 1.0992104779974294, + "grad_norm": 0.24905221164226532, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0048, + "step": 17960 + }, + { + "epoch": 1.0998225105575616, + "grad_norm": 0.1951165348291397, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0043, + "step": 17970 + }, + { + "epoch": 1.1004345431176938, + "grad_norm": 0.18365852534770966, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 1.101046575677826, + "grad_norm": 0.16304127871990204, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0034, + "step": 17990 + }, + { + "epoch": 1.1016586082379582, + "grad_norm": 0.262677401304245, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 1.1022706407980905, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.583791146965244e-06, + "loss": 0.007, + "step": 18010 + }, + { + "epoch": 1.1028826733582227, + "grad_norm": 0.2832951247692108, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0056, + "step": 18020 + }, + { + "epoch": 1.1034947059183549, + "grad_norm": 0.1781810224056244, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 1.104106738478487, + "grad_norm": 0.23228950798511505, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0045, + "step": 18040 + }, + { + "epoch": 1.1047187710386193, + "grad_norm": 0.2573170065879822, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 1.1053308035987515, + "grad_norm": 0.30996036529541016, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0054, + "step": 18060 + }, + { + "epoch": 1.1059428361588837, + "grad_norm": 0.24979132413864136, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0045, + "step": 18070 + }, + { + "epoch": 1.1065548687190159, + "grad_norm": 0.17564314603805542, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0033, + "step": 18080 + }, + { + "epoch": 1.107166901279148, + "grad_norm": 0.14539776742458344, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0047, + "step": 18090 + }, + { + "epoch": 1.1077789338392803, + "grad_norm": 0.2530387341976166, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0058, + "step": 18100 + }, + { + "epoch": 1.1083909663994125, + "grad_norm": 0.2038760781288147, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0052, + "step": 18110 + }, + { + "epoch": 1.1090029989595447, + "grad_norm": 0.1769075244665146, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0043, + "step": 18120 + }, + { + "epoch": 1.1096150315196769, + "grad_norm": 0.1686626374721527, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0055, + "step": 18130 + }, + { + "epoch": 1.110227064079809, + "grad_norm": 0.21752336621284485, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0052, + "step": 18140 + }, + { + "epoch": 1.1108390966399413, + "grad_norm": 0.2739295959472656, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0039, + "step": 18150 + }, + { + "epoch": 1.1114511292000735, + "grad_norm": 0.18259567022323608, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0038, + "step": 18160 + }, + { + "epoch": 1.1120631617602057, + "grad_norm": 0.21565310657024384, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0043, + "step": 18170 + }, + { + "epoch": 1.112675194320338, + "grad_norm": 0.2141607403755188, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0059, + "step": 18180 + }, + { + "epoch": 1.11328722688047, + "grad_norm": 0.3017563819885254, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0044, + "step": 18190 + }, + { + "epoch": 1.1138992594406023, + "grad_norm": 0.2021455019712448, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0044, + "step": 18200 + }, + { + "epoch": 1.1145112920007345, + "grad_norm": 0.2113070785999298, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0048, + "step": 18210 + }, + { + "epoch": 1.1151233245608667, + "grad_norm": 0.18945784866809845, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0029, + "step": 18220 + }, + { + "epoch": 1.115735357120999, + "grad_norm": 0.15259192883968353, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0043, + "step": 18230 + }, + { + "epoch": 1.1163473896811311, + "grad_norm": 0.17555822432041168, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0039, + "step": 18240 + }, + { + "epoch": 1.1169594222412633, + "grad_norm": 0.20105648040771484, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0058, + "step": 18250 + }, + { + "epoch": 1.1175714548013955, + "grad_norm": 0.31626567244529724, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0044, + "step": 18260 + }, + { + "epoch": 1.1181834873615277, + "grad_norm": 0.16219007968902588, + "learning_rate": 8.340593854157868e-06, + "loss": 0.005, + "step": 18270 + }, + { + "epoch": 1.11879551992166, + "grad_norm": 0.2174186110496521, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0055, + "step": 18280 + }, + { + "epoch": 1.1194075524817921, + "grad_norm": 0.13639339804649353, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0043, + "step": 18290 + }, + { + "epoch": 1.1200195850419243, + "grad_norm": 0.15100249648094177, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0042, + "step": 18300 + }, + { + "epoch": 1.1206316176020565, + "grad_norm": 0.2114904671907425, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0043, + "step": 18310 + }, + { + "epoch": 1.1212436501621887, + "grad_norm": 0.2941966950893402, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0052, + "step": 18320 + }, + { + "epoch": 1.1218556827223207, + "grad_norm": 0.21695150434970856, + "learning_rate": 8.28476400245882e-06, + "loss": 0.005, + "step": 18330 + }, + { + "epoch": 1.122467715282453, + "grad_norm": 0.11768218129873276, + "learning_rate": 8.275470116190976e-06, + "loss": 0.005, + "step": 18340 + }, + { + "epoch": 1.1230797478425851, + "grad_norm": 0.1427483856678009, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0039, + "step": 18350 + }, + { + "epoch": 1.1236917804027173, + "grad_norm": 0.1837971955537796, + "learning_rate": 8.256891946721157e-06, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 1.1243038129628495, + "grad_norm": 0.30968883633613586, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0037, + "step": 18370 + }, + { + "epoch": 1.1249158455229817, + "grad_norm": 0.13366396725177765, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0042, + "step": 18380 + }, + { + "epoch": 1.125527878083114, + "grad_norm": 0.1829235553741455, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0039, + "step": 18390 + }, + { + "epoch": 1.1261399106432461, + "grad_norm": 0.3106991648674011, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0052, + "step": 18400 + }, + { + "epoch": 1.1267519432033783, + "grad_norm": 0.38655754923820496, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0046, + "step": 18410 + }, + { + "epoch": 1.1273639757635106, + "grad_norm": 0.23598383367061615, + "learning_rate": 8.201235047388747e-06, + "loss": 0.004, + "step": 18420 + }, + { + "epoch": 1.1279760083236428, + "grad_norm": 0.17428012192249298, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0046, + "step": 18430 + }, + { + "epoch": 1.128588040883775, + "grad_norm": 0.1847466081380844, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0043, + "step": 18440 + }, + { + "epoch": 1.1292000734439072, + "grad_norm": 0.14917762577533722, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0049, + "step": 18450 + }, + { + "epoch": 1.1298121060040394, + "grad_norm": 0.2882528305053711, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 1.1304241385641716, + "grad_norm": 0.36186549067497253, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0065, + "step": 18470 + }, + { + "epoch": 1.1310361711243038, + "grad_norm": 0.1604463905096054, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0037, + "step": 18480 + }, + { + "epoch": 1.131648203684436, + "grad_norm": 0.17751921713352203, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0034, + "step": 18490 + }, + { + "epoch": 1.1322602362445682, + "grad_norm": 0.15355733036994934, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0059, + "step": 18500 + }, + { + "epoch": 1.1328722688047004, + "grad_norm": 0.21558596193790436, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0044, + "step": 18510 + }, + { + "epoch": 1.1334843013648326, + "grad_norm": 0.20114412903785706, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1340963339249648, + "grad_norm": 0.17260855436325073, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0053, + "step": 18530 + }, + { + "epoch": 1.134708366485097, + "grad_norm": 0.16089287400245667, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0032, + "step": 18540 + }, + { + "epoch": 1.1353203990452292, + "grad_norm": 0.14655937254428864, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 1.1359324316053614, + "grad_norm": 0.16373249888420105, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0058, + "step": 18560 + }, + { + "epoch": 1.1365444641654936, + "grad_norm": 0.14543801546096802, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1371564967256258, + "grad_norm": 0.3515278100967407, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0043, + "step": 18580 + }, + { + "epoch": 1.137768529285758, + "grad_norm": 0.21776945888996124, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0053, + "step": 18590 + }, + { + "epoch": 1.1383805618458902, + "grad_norm": 0.21879829466342926, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0051, + "step": 18600 + }, + { + "epoch": 1.1389925944060224, + "grad_norm": 0.16967973113059998, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0048, + "step": 18610 + }, + { + "epoch": 1.1396046269661546, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0056, + "step": 18620 + }, + { + "epoch": 1.1402166595262868, + "grad_norm": 0.1858961284160614, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0067, + "step": 18630 + }, + { + "epoch": 1.140828692086419, + "grad_norm": 0.25853803753852844, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0057, + "step": 18640 + }, + { + "epoch": 1.1414407246465512, + "grad_norm": 0.18566234409809113, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0048, + "step": 18650 + }, + { + "epoch": 1.1420527572066834, + "grad_norm": 0.3471083343029022, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0042, + "step": 18660 + }, + { + "epoch": 1.1426647897668156, + "grad_norm": 0.2092636376619339, + "learning_rate": 7.970630670012853e-06, + "loss": 0.004, + "step": 18670 + }, + { + "epoch": 1.1432768223269478, + "grad_norm": 0.3432580828666687, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 1.14388885488708, + "grad_norm": 0.14227882027626038, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0041, + "step": 18690 + }, + { + "epoch": 1.1445008874472122, + "grad_norm": 0.2128007709980011, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0037, + "step": 18700 + }, + { + "epoch": 1.1451129200073444, + "grad_norm": 0.25377482175827026, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0049, + "step": 18710 + }, + { + "epoch": 1.1457249525674766, + "grad_norm": 0.1905982494354248, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0037, + "step": 18720 + }, + { + "epoch": 1.1463369851276088, + "grad_norm": 0.3090096712112427, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0079, + "step": 18730 + }, + { + "epoch": 1.146949017687741, + "grad_norm": 0.15604345500469208, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0037, + "step": 18740 + }, + { + "epoch": 1.1475610502478732, + "grad_norm": 0.21756386756896973, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0043, + "step": 18750 + }, + { + "epoch": 1.1481730828080055, + "grad_norm": 0.23869304358959198, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0038, + "step": 18760 + }, + { + "epoch": 1.1487851153681377, + "grad_norm": 0.18082380294799805, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 1.1493971479282699, + "grad_norm": 0.4032754898071289, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0061, + "step": 18780 + }, + { + "epoch": 1.150009180488402, + "grad_norm": 0.3173290491104126, + "learning_rate": 7.860719408056385e-06, + "loss": 0.004, + "step": 18790 + }, + { + "epoch": 1.1506212130485343, + "grad_norm": 0.18892645835876465, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.1512332456086665, + "grad_norm": 0.26740241050720215, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0056, + "step": 18810 + }, + { + "epoch": 1.1518452781687987, + "grad_norm": 0.3046218752861023, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0072, + "step": 18820 + }, + { + "epoch": 1.1524573107289309, + "grad_norm": 0.17181983590126038, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0034, + "step": 18830 + }, + { + "epoch": 1.1530693432890629, + "grad_norm": 0.22095724940299988, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0045, + "step": 18840 + }, + { + "epoch": 1.153681375849195, + "grad_norm": 0.1514609307050705, + "learning_rate": 7.80596155940873e-06, + "loss": 0.004, + "step": 18850 + }, + { + "epoch": 1.1542934084093273, + "grad_norm": 0.15244366228580475, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0047, + "step": 18860 + }, + { + "epoch": 1.1549054409694595, + "grad_norm": 0.24359947443008423, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0039, + "step": 18870 + }, + { + "epoch": 1.1555174735295917, + "grad_norm": 0.15558156371116638, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0036, + "step": 18880 + }, + { + "epoch": 1.1561295060897239, + "grad_norm": 0.33679234981536865, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0041, + "step": 18890 + }, + { + "epoch": 1.156741538649856, + "grad_norm": 0.15811999142169952, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0062, + "step": 18900 + }, + { + "epoch": 1.1573535712099883, + "grad_norm": 0.14838527143001556, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0029, + "step": 18910 + }, + { + "epoch": 1.1579656037701205, + "grad_norm": 0.23024815320968628, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0038, + "step": 18920 + }, + { + "epoch": 1.1585776363302527, + "grad_norm": 0.18455618619918823, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0044, + "step": 18930 + }, + { + "epoch": 1.1591896688903849, + "grad_norm": 0.20213079452514648, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.159801701450517, + "grad_norm": 0.19000643491744995, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0043, + "step": 18950 + }, + { + "epoch": 1.1604137340106493, + "grad_norm": 0.14075686037540436, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0047, + "step": 18960 + }, + { + "epoch": 1.1610257665707815, + "grad_norm": 0.22101792693138123, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0045, + "step": 18970 + }, + { + "epoch": 1.1616377991309137, + "grad_norm": 0.1097906231880188, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0037, + "step": 18980 + }, + { + "epoch": 1.162249831691046, + "grad_norm": 0.16169370710849762, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.162861864251178, + "grad_norm": 0.32931753993034363, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0052, + "step": 19000 + }, + { + "epoch": 1.1634738968113103, + "grad_norm": 0.2494741678237915, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0057, + "step": 19010 + }, + { + "epoch": 1.1640859293714425, + "grad_norm": 0.18492171168327332, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0056, + "step": 19020 + }, + { + "epoch": 1.1646979619315747, + "grad_norm": 0.18830963969230652, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0036, + "step": 19030 + }, + { + "epoch": 1.165309994491707, + "grad_norm": 0.1331586092710495, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0046, + "step": 19040 + }, + { + "epoch": 1.1659220270518391, + "grad_norm": 0.2433806210756302, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0053, + "step": 19050 + }, + { + "epoch": 1.1665340596119713, + "grad_norm": 0.24491485953330994, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 1.1671460921721035, + "grad_norm": 0.1789211630821228, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 1.1677581247322357, + "grad_norm": 0.2729121148586273, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0043, + "step": 19080 + }, + { + "epoch": 1.168370157292368, + "grad_norm": 0.19535189867019653, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0056, + "step": 19090 + }, + { + "epoch": 1.1689821898525001, + "grad_norm": 0.2282983660697937, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0048, + "step": 19100 + }, + { + "epoch": 1.1695942224126323, + "grad_norm": 0.1281195729970932, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0045, + "step": 19110 + }, + { + "epoch": 1.1702062549727645, + "grad_norm": 0.2850968539714813, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0034, + "step": 19120 + }, + { + "epoch": 1.1708182875328967, + "grad_norm": 0.12891536951065063, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.171430320093029, + "grad_norm": 0.13464727997779846, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0033, + "step": 19140 + }, + { + "epoch": 1.1720423526531611, + "grad_norm": 0.2415568083524704, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0041, + "step": 19150 + }, + { + "epoch": 1.1726543852132933, + "grad_norm": 0.15686331689357758, + "learning_rate": 7.525246655150879e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 1.1732664177734256, + "grad_norm": 0.15490666031837463, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0039, + "step": 19170 + }, + { + "epoch": 1.1738784503335578, + "grad_norm": 0.14095450937747955, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0034, + "step": 19180 + }, + { + "epoch": 1.17449048289369, + "grad_norm": 0.19024531543254852, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0084, + "step": 19190 + }, + { + "epoch": 1.1751025154538222, + "grad_norm": 0.2583692669868469, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 1.1757145480139544, + "grad_norm": 0.19117654860019684, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0038, + "step": 19210 + }, + { + "epoch": 1.1763265805740866, + "grad_norm": 0.15838374197483063, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1769386131342188, + "grad_norm": 0.30352044105529785, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0052, + "step": 19230 + }, + { + "epoch": 1.177550645694351, + "grad_norm": 0.229969322681427, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0043, + "step": 19240 + }, + { + "epoch": 1.1781626782544832, + "grad_norm": 0.17781461775302887, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0046, + "step": 19250 + }, + { + "epoch": 1.1787747108146154, + "grad_norm": 0.1306339055299759, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1793867433747476, + "grad_norm": 0.15727253258228302, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0045, + "step": 19270 + }, + { + "epoch": 1.1799987759348798, + "grad_norm": 0.24909166991710663, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0045, + "step": 19280 + }, + { + "epoch": 1.180610808495012, + "grad_norm": 0.4604126811027527, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0053, + "step": 19290 + }, + { + "epoch": 1.1812228410551442, + "grad_norm": 0.12739762663841248, + "learning_rate": 7.399737764864619e-06, + "loss": 0.004, + "step": 19300 + }, + { + "epoch": 1.1818348736152764, + "grad_norm": 0.2849223017692566, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0043, + "step": 19310 + }, + { + "epoch": 1.1824469061754086, + "grad_norm": 0.26089897751808167, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0044, + "step": 19320 + }, + { + "epoch": 1.1830589387355408, + "grad_norm": 0.1752242147922516, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.183670971295673, + "grad_norm": 0.14917130768299103, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0097, + "step": 19340 + }, + { + "epoch": 1.1842830038558052, + "grad_norm": 0.1599114090204239, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0061, + "step": 19350 + }, + { + "epoch": 1.1848950364159374, + "grad_norm": 0.16370004415512085, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 1.1855070689760696, + "grad_norm": 0.19354844093322754, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0032, + "step": 19370 + }, + { + "epoch": 1.1861191015362018, + "grad_norm": 0.19689561426639557, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0067, + "step": 19380 + }, + { + "epoch": 1.186731134096334, + "grad_norm": 0.22203278541564941, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0041, + "step": 19390 + }, + { + "epoch": 1.1873431666564662, + "grad_norm": 0.13579773902893066, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0048, + "step": 19400 + }, + { + "epoch": 1.1879551992165984, + "grad_norm": 0.12321218848228455, + "learning_rate": 7.301703138094429e-06, + "loss": 0.004, + "step": 19410 + }, + { + "epoch": 1.1885672317767306, + "grad_norm": 0.28819525241851807, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 1.1891792643368628, + "grad_norm": 0.2577916085720062, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0039, + "step": 19430 + }, + { + "epoch": 1.189791296896995, + "grad_norm": 0.26840633153915405, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0062, + "step": 19440 + }, + { + "epoch": 1.1904033294571272, + "grad_norm": 0.24222144484519958, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1910153620172594, + "grad_norm": 0.157009556889534, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0038, + "step": 19460 + }, + { + "epoch": 1.1916273945773916, + "grad_norm": 0.19925500452518463, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0042, + "step": 19470 + }, + { + "epoch": 1.1922394271375236, + "grad_norm": 0.19200846552848816, + "learning_rate": 7.239590017751423e-06, + "loss": 0.004, + "step": 19480 + }, + { + "epoch": 1.1928514596976558, + "grad_norm": 0.18441490828990936, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 1.193463492257788, + "grad_norm": 0.27565324306488037, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0046, + "step": 19500 + }, + { + "epoch": 1.1940755248179202, + "grad_norm": 0.17830556631088257, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0043, + "step": 19510 + }, + { + "epoch": 1.1946875573780524, + "grad_norm": 0.2769330143928528, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 1.1952995899381846, + "grad_norm": 0.168451189994812, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0039, + "step": 19530 + }, + { + "epoch": 1.1959116224983168, + "grad_norm": 0.31246763467788696, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0046, + "step": 19540 + }, + { + "epoch": 1.196523655058449, + "grad_norm": 0.21112671494483948, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0041, + "step": 19550 + }, + { + "epoch": 1.1971356876185812, + "grad_norm": 0.31681302189826965, + "learning_rate": 7.168868583990693e-06, + "loss": 0.005, + "step": 19560 + }, + { + "epoch": 1.1977477201787134, + "grad_norm": 0.18634411692619324, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0042, + "step": 19570 + }, + { + "epoch": 1.1983597527388457, + "grad_norm": 0.17780153453350067, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0057, + "step": 19580 + }, + { + "epoch": 1.1989717852989779, + "grad_norm": 0.19183002412319183, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0043, + "step": 19590 + }, + { + "epoch": 1.19958381785911, + "grad_norm": 0.28469574451446533, + "learning_rate": 7.133615440411572e-06, + "loss": 0.004, + "step": 19600 + }, + { + "epoch": 1.2001958504192423, + "grad_norm": 0.22470368444919586, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0044, + "step": 19610 + }, + { + "epoch": 1.2008078829793745, + "grad_norm": 0.23563240468502045, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0041, + "step": 19620 + }, + { + "epoch": 1.2014199155395067, + "grad_norm": 0.18467430770397186, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0053, + "step": 19630 + }, + { + "epoch": 1.2020319480996389, + "grad_norm": 0.12539178133010864, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0047, + "step": 19640 + }, + { + "epoch": 1.202643980659771, + "grad_norm": 0.2552005648612976, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.004, + "step": 19650 + }, + { + "epoch": 1.2032560132199033, + "grad_norm": 0.13963459432125092, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0035, + "step": 19660 + }, + { + "epoch": 1.2038680457800355, + "grad_norm": 0.17387327551841736, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0038, + "step": 19670 + }, + { + "epoch": 1.2044800783401677, + "grad_norm": 0.1284111589193344, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0044, + "step": 19680 + }, + { + "epoch": 1.2050921109002999, + "grad_norm": 0.22337380051612854, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 1.205704143460432, + "grad_norm": 0.2254808247089386, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0033, + "step": 19700 + }, + { + "epoch": 1.2063161760205643, + "grad_norm": 0.19316980242729187, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0041, + "step": 19710 + }, + { + "epoch": 1.2069282085806965, + "grad_norm": 0.17951075732707977, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0038, + "step": 19720 + }, + { + "epoch": 1.2075402411408287, + "grad_norm": 0.3105165660381317, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0043, + "step": 19730 + }, + { + "epoch": 1.208152273700961, + "grad_norm": 0.21083533763885498, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0039, + "step": 19740 + }, + { + "epoch": 1.208764306261093, + "grad_norm": 0.20121195912361145, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0035, + "step": 19750 + }, + { + "epoch": 1.2093763388212253, + "grad_norm": 0.20067447423934937, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0047, + "step": 19760 + }, + { + "epoch": 1.2099883713813575, + "grad_norm": 0.15943066775798798, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0039, + "step": 19770 + }, + { + "epoch": 1.2106004039414897, + "grad_norm": 0.21581032872200012, + "learning_rate": 6.975884226362e-06, + "loss": 0.0045, + "step": 19780 + }, + { + "epoch": 1.211212436501622, + "grad_norm": 0.16258753836154938, + "learning_rate": 6.967165692827958e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 1.2118244690617541, + "grad_norm": 0.18742400407791138, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0047, + "step": 19800 + }, + { + "epoch": 1.2124365016218863, + "grad_norm": 0.09035168588161469, + "learning_rate": 6.949742834253074e-06, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 1.2130485341820185, + "grad_norm": 0.21749694645404816, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0054, + "step": 19820 + }, + { + "epoch": 1.2136605667421507, + "grad_norm": 0.3189448416233063, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0043, + "step": 19830 + }, + { + "epoch": 1.214272599302283, + "grad_norm": 0.26815512776374817, + "learning_rate": 6.923644220932124e-06, + "loss": 0.005, + "step": 19840 + }, + { + "epoch": 1.2148846318624151, + "grad_norm": 0.19533704221248627, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0037, + "step": 19850 + }, + { + "epoch": 1.2154966644225473, + "grad_norm": 0.36249589920043945, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0064, + "step": 19860 + }, + { + "epoch": 1.2161086969826795, + "grad_norm": 0.19801265001296997, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0042, + "step": 19870 + }, + { + "epoch": 1.2167207295428117, + "grad_norm": 0.10341386497020721, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0053, + "step": 19880 + }, + { + "epoch": 1.217332762102944, + "grad_norm": 0.17985381186008453, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.2179447946630761, + "grad_norm": 0.18160982429981232, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 1.2185568272232083, + "grad_norm": 0.15552182495594025, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 1.2191688597833406, + "grad_norm": 0.34908807277679443, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0046, + "step": 19920 + }, + { + "epoch": 1.2197808923434728, + "grad_norm": 0.14835652709007263, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0042, + "step": 19930 + }, + { + "epoch": 1.220392924903605, + "grad_norm": 0.23276430368423462, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 1.2210049574637372, + "grad_norm": 0.1900823563337326, + "learning_rate": 6.828319751504063e-06, + "loss": 0.004, + "step": 19950 + }, + { + "epoch": 1.2216169900238694, + "grad_norm": 0.134046271443367, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0039, + "step": 19960 + }, + { + "epoch": 1.2222290225840013, + "grad_norm": 0.17264600098133087, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 1.2228410551441335, + "grad_norm": 0.24845834076404572, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0044, + "step": 19980 + }, + { + "epoch": 1.2234530877042658, + "grad_norm": 0.14805762469768524, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0049, + "step": 19990 + }, + { + "epoch": 1.224065120264398, + "grad_norm": 0.228907972574234, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0043, + "step": 20000 + }, + { + "epoch": 1.2246771528245302, + "grad_norm": 0.16869507730007172, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0041, + "step": 20010 + }, + { + "epoch": 1.2252891853846624, + "grad_norm": 0.1983603835105896, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0041, + "step": 20020 + }, + { + "epoch": 1.2259012179447946, + "grad_norm": 0.17656362056732178, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0028, + "step": 20030 + }, + { + "epoch": 1.2265132505049268, + "grad_norm": 0.1360313892364502, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0069, + "step": 20040 + }, + { + "epoch": 1.227125283065059, + "grad_norm": 0.21057721972465515, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2277373156251912, + "grad_norm": 0.138632670044899, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0038, + "step": 20060 + }, + { + "epoch": 1.2283493481853234, + "grad_norm": 0.17815573513507843, + "learning_rate": 6.725005485342219e-06, + "loss": 0.003, + "step": 20070 + }, + { + "epoch": 1.2289613807454556, + "grad_norm": 0.1769353598356247, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0066, + "step": 20080 + }, + { + "epoch": 1.2295734133055878, + "grad_norm": 0.23068928718566895, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0048, + "step": 20090 + }, + { + "epoch": 1.23018544586572, + "grad_norm": 0.25139328837394714, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0049, + "step": 20100 + }, + { + "epoch": 1.2307974784258522, + "grad_norm": 0.09128634631633759, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0042, + "step": 20110 + }, + { + "epoch": 1.2314095109859844, + "grad_norm": 0.20516613125801086, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0031, + "step": 20120 + }, + { + "epoch": 1.2320215435461166, + "grad_norm": 0.1518358588218689, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0049, + "step": 20130 + }, + { + "epoch": 1.2326335761062488, + "grad_norm": 0.1673758625984192, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0044, + "step": 20140 + }, + { + "epoch": 1.233245608666381, + "grad_norm": 0.14084585011005402, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0053, + "step": 20150 + }, + { + "epoch": 1.2338576412265132, + "grad_norm": 0.23316942155361176, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0042, + "step": 20160 + }, + { + "epoch": 1.2344696737866454, + "grad_norm": 0.23793813586235046, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0045, + "step": 20170 + }, + { + "epoch": 1.2350817063467776, + "grad_norm": 0.4269389510154724, + "learning_rate": 6.630934952049143e-06, + "loss": 0.005, + "step": 20180 + }, + { + "epoch": 1.2356937389069098, + "grad_norm": 0.15654191374778748, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0039, + "step": 20190 + }, + { + "epoch": 1.236305771467042, + "grad_norm": 0.19204623997211456, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0032, + "step": 20200 + }, + { + "epoch": 1.2369178040271742, + "grad_norm": 0.15817691385746002, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0044, + "step": 20210 + }, + { + "epoch": 1.2375298365873064, + "grad_norm": 0.12637947499752045, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2381418691474386, + "grad_norm": 0.26657921075820923, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0041, + "step": 20230 + }, + { + "epoch": 1.2387539017075708, + "grad_norm": 0.15207791328430176, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0045, + "step": 20240 + }, + { + "epoch": 1.239365934267703, + "grad_norm": 0.32583367824554443, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 1.2399779668278352, + "grad_norm": 0.15617726743221283, + "learning_rate": 6.562908932779455e-06, + "loss": 0.004, + "step": 20260 + }, + { + "epoch": 1.2405899993879674, + "grad_norm": 0.1935809850692749, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2412020319480996, + "grad_norm": 0.17422369122505188, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0035, + "step": 20280 + }, + { + "epoch": 1.2418140645082318, + "grad_norm": 0.15332955121994019, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0049, + "step": 20290 + }, + { + "epoch": 1.242426097068364, + "grad_norm": 0.16183018684387207, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0042, + "step": 20300 + }, + { + "epoch": 1.2430381296284962, + "grad_norm": 0.28421106934547424, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0045, + "step": 20310 + }, + { + "epoch": 1.2436501621886284, + "grad_norm": 0.23288874328136444, + "learning_rate": 6.512107839793337e-06, + "loss": 0.004, + "step": 20320 + }, + { + "epoch": 1.2442621947487607, + "grad_norm": 0.17955242097377777, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 1.2448742273088929, + "grad_norm": 0.20192117989063263, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 1.245486259869025, + "grad_norm": 0.15365810692310333, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 1.2460982924291573, + "grad_norm": 0.25220832228660583, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0039, + "step": 20360 + }, + { + "epoch": 1.2467103249892895, + "grad_norm": 0.25777462124824524, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0053, + "step": 20370 + }, + { + "epoch": 1.2473223575494217, + "grad_norm": 0.2693277895450592, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0053, + "step": 20380 + }, + { + "epoch": 1.2479343901095539, + "grad_norm": 0.22846420109272003, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0033, + "step": 20390 + }, + { + "epoch": 1.248546422669686, + "grad_norm": 0.17022505402565002, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0046, + "step": 20400 + }, + { + "epoch": 1.2491584552298183, + "grad_norm": 0.08295682072639465, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0035, + "step": 20410 + }, + { + "epoch": 1.2497704877899505, + "grad_norm": 0.2745625972747803, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0044, + "step": 20420 + }, + { + "epoch": 1.2503825203500827, + "grad_norm": 0.12855033576488495, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2509945529102149, + "grad_norm": 0.30358386039733887, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0049, + "step": 20440 + }, + { + "epoch": 1.251606585470347, + "grad_norm": 0.15514959394931793, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0048, + "step": 20450 + }, + { + "epoch": 1.2522186180304793, + "grad_norm": 0.1414988487958908, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0088, + "step": 20460 + }, + { + "epoch": 1.2528306505906115, + "grad_norm": 0.17399665713310242, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0046, + "step": 20470 + }, + { + "epoch": 1.2534426831507437, + "grad_norm": 0.22629426419734955, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.254054715710876, + "grad_norm": 0.30595293641090393, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0027, + "step": 20490 + }, + { + "epoch": 1.254666748271008, + "grad_norm": 0.17980262637138367, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2552787808311403, + "grad_norm": 0.19016452133655548, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0029, + "step": 20510 + }, + { + "epoch": 1.2558908133912725, + "grad_norm": 0.20200394093990326, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0037, + "step": 20520 + }, + { + "epoch": 1.2565028459514047, + "grad_norm": 0.15347513556480408, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0041, + "step": 20530 + }, + { + "epoch": 1.257114878511537, + "grad_norm": 0.1851687729358673, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0042, + "step": 20540 + }, + { + "epoch": 1.2577269110716691, + "grad_norm": 0.2529662549495697, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0037, + "step": 20550 + }, + { + "epoch": 1.2583389436318013, + "grad_norm": 0.18209592998027802, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 1.2589509761919335, + "grad_norm": 0.18981963396072388, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0036, + "step": 20570 + }, + { + "epoch": 1.2595630087520657, + "grad_norm": 0.13232728838920593, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0035, + "step": 20580 + }, + { + "epoch": 1.260175041312198, + "grad_norm": 0.133514404296875, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0039, + "step": 20590 + }, + { + "epoch": 1.2607870738723301, + "grad_norm": 0.14339123666286469, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0043, + "step": 20600 + }, + { + "epoch": 1.2613991064324623, + "grad_norm": 0.48857489228248596, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0045, + "step": 20610 + }, + { + "epoch": 1.2620111389925945, + "grad_norm": 0.1513262242078781, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0029, + "step": 20620 + }, + { + "epoch": 1.2626231715527267, + "grad_norm": 0.1497354805469513, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0039, + "step": 20630 + }, + { + "epoch": 1.2632352041128587, + "grad_norm": 0.132791206240654, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0037, + "step": 20640 + }, + { + "epoch": 1.263847236672991, + "grad_norm": 0.13804496824741364, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0035, + "step": 20650 + }, + { + "epoch": 1.2644592692331231, + "grad_norm": 0.19393391907215118, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0049, + "step": 20660 + }, + { + "epoch": 1.2650713017932553, + "grad_norm": 0.17623338103294373, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0038, + "step": 20670 + }, + { + "epoch": 1.2656833343533875, + "grad_norm": 0.26931124925613403, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0042, + "step": 20680 + }, + { + "epoch": 1.2662953669135197, + "grad_norm": 0.17984439432621002, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0036, + "step": 20690 + }, + { + "epoch": 1.266907399473652, + "grad_norm": 0.19648219645023346, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0046, + "step": 20700 + }, + { + "epoch": 1.2675194320337841, + "grad_norm": 0.1464766263961792, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0024, + "step": 20710 + }, + { + "epoch": 1.2681314645939163, + "grad_norm": 0.1271074265241623, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0042, + "step": 20720 + }, + { + "epoch": 1.2687434971540485, + "grad_norm": 0.15960967540740967, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0079, + "step": 20730 + }, + { + "epoch": 1.2693555297141808, + "grad_norm": 0.13636153936386108, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0046, + "step": 20740 + }, + { + "epoch": 1.269967562274313, + "grad_norm": 0.19099050760269165, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0046, + "step": 20750 + }, + { + "epoch": 1.2705795948344452, + "grad_norm": 0.28632739186286926, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0036, + "step": 20760 + }, + { + "epoch": 1.2711916273945774, + "grad_norm": 0.2565019726753235, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0055, + "step": 20770 + }, + { + "epoch": 1.2718036599547096, + "grad_norm": 0.24443399906158447, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2724156925148418, + "grad_norm": 0.1396762877702713, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0029, + "step": 20790 + }, + { + "epoch": 1.273027725074974, + "grad_norm": 0.3028377890586853, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0061, + "step": 20800 + }, + { + "epoch": 1.2736397576351062, + "grad_norm": 0.18195804953575134, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0034, + "step": 20810 + }, + { + "epoch": 1.2742517901952384, + "grad_norm": 0.16194652020931244, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0054, + "step": 20820 + }, + { + "epoch": 1.2748638227553706, + "grad_norm": 0.13011956214904785, + "learning_rate": 6.08816828695283e-06, + "loss": 0.003, + "step": 20830 + }, + { + "epoch": 1.2754758553155028, + "grad_norm": 0.23294220864772797, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0041, + "step": 20840 + }, + { + "epoch": 1.276087887875635, + "grad_norm": 0.1892961710691452, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0031, + "step": 20850 + }, + { + "epoch": 1.2766999204357672, + "grad_norm": 0.1984476000070572, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0046, + "step": 20860 + }, + { + "epoch": 1.2773119529958994, + "grad_norm": 0.158709317445755, + "learning_rate": 6.055535530104466e-06, + "loss": 0.003, + "step": 20870 + }, + { + "epoch": 1.2779239855560316, + "grad_norm": 0.16505110263824463, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0039, + "step": 20880 + }, + { + "epoch": 1.2785360181161638, + "grad_norm": 0.18332232534885406, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0036, + "step": 20890 + }, + { + "epoch": 1.279148050676296, + "grad_norm": 0.1797804981470108, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0049, + "step": 20900 + }, + { + "epoch": 1.2797600832364282, + "grad_norm": 0.19247964024543762, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0053, + "step": 20910 + }, + { + "epoch": 1.2803721157965604, + "grad_norm": 0.17845408618450165, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0045, + "step": 20920 + }, + { + "epoch": 1.2809841483566926, + "grad_norm": 0.09454555809497833, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0027, + "step": 20930 + }, + { + "epoch": 1.2815961809168248, + "grad_norm": 0.12647129595279694, + "learning_rate": 5.998651973182953e-06, + "loss": 0.004, + "step": 20940 + }, + { + "epoch": 1.282208213476957, + "grad_norm": 0.39115941524505615, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0051, + "step": 20950 + }, + { + "epoch": 1.2828202460370892, + "grad_norm": 0.29081296920776367, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 1.2834322785972214, + "grad_norm": 0.1849275827407837, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0042, + "step": 20970 + }, + { + "epoch": 1.2840443111573536, + "grad_norm": 0.24075689911842346, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0031, + "step": 20980 + }, + { + "epoch": 1.2846563437174858, + "grad_norm": 0.12463482469320297, + "learning_rate": 5.958196751005967e-06, + "loss": 0.003, + "step": 20990 + }, + { + "epoch": 1.285268376277618, + "grad_norm": 0.16987742483615875, + "learning_rate": 5.950123419134817e-06, + "loss": 0.004, + "step": 21000 + }, + { + "epoch": 1.2858804088377502, + "grad_norm": 0.20316782593727112, + "learning_rate": 5.942056013575106e-06, + "loss": 0.004, + "step": 21010 + }, + { + "epoch": 1.2864924413978824, + "grad_norm": 0.20989514887332916, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0053, + "step": 21020 + }, + { + "epoch": 1.2871044739580146, + "grad_norm": 0.33795273303985596, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0048, + "step": 21030 + }, + { + "epoch": 1.2877165065181468, + "grad_norm": 0.13918501138687134, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.004, + "step": 21040 + }, + { + "epoch": 1.288328539078279, + "grad_norm": 0.2992899715900421, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0038, + "step": 21050 + }, + { + "epoch": 1.288940571638411, + "grad_norm": 0.2540164589881897, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 1.2895526041985432, + "grad_norm": 0.161032035946846, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0047, + "step": 21070 + }, + { + "epoch": 1.2901646367586754, + "grad_norm": 0.1743200421333313, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0037, + "step": 21080 + }, + { + "epoch": 1.2907766693188076, + "grad_norm": 0.26604363322257996, + "learning_rate": 5.877731250949785e-06, + "loss": 0.004, + "step": 21090 + }, + { + "epoch": 1.2913887018789398, + "grad_norm": 0.275696724653244, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0044, + "step": 21100 + }, + { + "epoch": 1.292000734439072, + "grad_norm": 0.16888457536697388, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0042, + "step": 21110 + }, + { + "epoch": 1.2926127669992042, + "grad_norm": 0.12902231514453888, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0048, + "step": 21120 + }, + { + "epoch": 1.2932247995593364, + "grad_norm": 0.14577728509902954, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0046, + "step": 21130 + }, + { + "epoch": 1.2938368321194686, + "grad_norm": 0.1544434279203415, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0031, + "step": 21140 + }, + { + "epoch": 1.2944488646796009, + "grad_norm": 0.09238115698099136, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0035, + "step": 21150 + }, + { + "epoch": 1.295060897239733, + "grad_norm": 0.1770051270723343, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0033, + "step": 21160 + }, + { + "epoch": 1.2956729297998653, + "grad_norm": 0.20360831916332245, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0037, + "step": 21170 + }, + { + "epoch": 1.2962849623599975, + "grad_norm": 0.18503794074058533, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0045, + "step": 21180 + }, + { + "epoch": 1.2968969949201297, + "grad_norm": 0.12918968498706818, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0048, + "step": 21190 + }, + { + "epoch": 1.2975090274802619, + "grad_norm": 0.14289438724517822, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0041, + "step": 21200 + }, + { + "epoch": 1.298121060040394, + "grad_norm": 0.17546117305755615, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0042, + "step": 21210 + }, + { + "epoch": 1.2987330926005263, + "grad_norm": 0.2919277846813202, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0051, + "step": 21220 + }, + { + "epoch": 1.2993451251606585, + "grad_norm": 0.0988069474697113, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0044, + "step": 21230 + }, + { + "epoch": 1.2999571577207907, + "grad_norm": 0.19284513592720032, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0037, + "step": 21240 + }, + { + "epoch": 1.3005691902809229, + "grad_norm": 0.12894058227539062, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0031, + "step": 21250 + }, + { + "epoch": 1.301181222841055, + "grad_norm": 0.14740346372127533, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.3017932554011873, + "grad_norm": 0.16817794740200043, + "learning_rate": 5.734414476316747e-06, + "loss": 0.005, + "step": 21270 + }, + { + "epoch": 1.3024052879613195, + "grad_norm": 0.29237234592437744, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0039, + "step": 21280 + }, + { + "epoch": 1.3030173205214517, + "grad_norm": 0.12649856507778168, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.303629353081584, + "grad_norm": 0.11057443916797638, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0039, + "step": 21300 + }, + { + "epoch": 1.304241385641716, + "grad_norm": 0.13494674861431122, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.3048534182018483, + "grad_norm": 0.3079472482204437, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0042, + "step": 21320 + }, + { + "epoch": 1.3054654507619805, + "grad_norm": 0.13513535261154175, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.3060774833221127, + "grad_norm": 0.39266663789749146, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0046, + "step": 21340 + }, + { + "epoch": 1.306689515882245, + "grad_norm": 0.15097978711128235, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.3073015484423771, + "grad_norm": 0.25206202268600464, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0049, + "step": 21360 + }, + { + "epoch": 1.3079135810025093, + "grad_norm": 0.16765817999839783, + "learning_rate": 5.655655685355026e-06, + "loss": 0.005, + "step": 21370 + }, + { + "epoch": 1.3085256135626415, + "grad_norm": 0.2137158215045929, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0048, + "step": 21380 + }, + { + "epoch": 1.3091376461227737, + "grad_norm": 0.19711454212665558, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0043, + "step": 21390 + }, + { + "epoch": 1.309749678682906, + "grad_norm": 0.1722051054239273, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0044, + "step": 21400 + }, + { + "epoch": 1.3103617112430381, + "grad_norm": 0.1807536482810974, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 1.3109737438031703, + "grad_norm": 0.15052185952663422, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.004, + "step": 21420 + }, + { + "epoch": 1.3115857763633025, + "grad_norm": 0.1485220491886139, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.3121978089234347, + "grad_norm": 0.15065325796604156, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0037, + "step": 21440 + }, + { + "epoch": 1.312809841483567, + "grad_norm": 0.17903591692447662, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0047, + "step": 21450 + }, + { + "epoch": 1.3134218740436991, + "grad_norm": 0.14310622215270996, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0043, + "step": 21460 + }, + { + "epoch": 1.3140339066038313, + "grad_norm": 0.12117830663919449, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0053, + "step": 21470 + }, + { + "epoch": 1.3146459391639636, + "grad_norm": 0.1484573632478714, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 1.3152579717240958, + "grad_norm": 0.16559219360351562, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0037, + "step": 21490 + }, + { + "epoch": 1.315870004284228, + "grad_norm": 0.21626432240009308, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0031, + "step": 21500 + }, + { + "epoch": 1.3164820368443602, + "grad_norm": 0.08177383989095688, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0052, + "step": 21510 + }, + { + "epoch": 1.3170940694044924, + "grad_norm": 0.18640732765197754, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0062, + "step": 21520 + }, + { + "epoch": 1.3177061019646246, + "grad_norm": 0.2599853277206421, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0039, + "step": 21530 + }, + { + "epoch": 1.3183181345247568, + "grad_norm": 0.1591203212738037, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0034, + "step": 21540 + }, + { + "epoch": 1.318930167084889, + "grad_norm": 0.2834412455558777, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0037, + "step": 21550 + }, + { + "epoch": 1.3195421996450212, + "grad_norm": 0.13853803277015686, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0038, + "step": 21560 + }, + { + "epoch": 1.3201542322051534, + "grad_norm": 0.14707128703594208, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0042, + "step": 21570 + }, + { + "epoch": 1.3207662647652856, + "grad_norm": 0.12561920285224915, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0038, + "step": 21580 + }, + { + "epoch": 1.3213782973254178, + "grad_norm": 0.4156799018383026, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0051, + "step": 21590 + }, + { + "epoch": 1.32199032988555, + "grad_norm": 0.11400662362575531, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0031, + "step": 21600 + }, + { + "epoch": 1.3226023624456822, + "grad_norm": 0.15658807754516602, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0052, + "step": 21610 + }, + { + "epoch": 1.3232143950058144, + "grad_norm": 0.1212862953543663, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0034, + "step": 21620 + }, + { + "epoch": 1.3238264275659466, + "grad_norm": 0.2201654314994812, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0036, + "step": 21630 + }, + { + "epoch": 1.3244384601260788, + "grad_norm": 0.11623375117778778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0032, + "step": 21640 + }, + { + "epoch": 1.325050492686211, + "grad_norm": 0.13092897832393646, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0035, + "step": 21650 + }, + { + "epoch": 1.3256625252463432, + "grad_norm": 0.15409153699874878, + "learning_rate": 5.430834687545416e-06, + "loss": 0.004, + "step": 21660 + }, + { + "epoch": 1.3262745578064754, + "grad_norm": 0.3148297369480133, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0031, + "step": 21670 + }, + { + "epoch": 1.3268865903666076, + "grad_norm": 0.13435055315494537, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0033, + "step": 21680 + }, + { + "epoch": 1.3274986229267398, + "grad_norm": 0.17878089845180511, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0029, + "step": 21690 + }, + { + "epoch": 1.328110655486872, + "grad_norm": 0.1823783665895462, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0039, + "step": 21700 + }, + { + "epoch": 1.3287226880470042, + "grad_norm": 0.14492660760879517, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0033, + "step": 21710 + }, + { + "epoch": 1.3293347206071364, + "grad_norm": 0.1730341762304306, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0041, + "step": 21720 + }, + { + "epoch": 1.3299467531672686, + "grad_norm": 0.07961586117744446, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3305587857274008, + "grad_norm": 0.14440582692623138, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0038, + "step": 21740 + }, + { + "epoch": 1.331170818287533, + "grad_norm": 0.22034496068954468, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0023, + "step": 21750 + }, + { + "epoch": 1.3317828508476652, + "grad_norm": 0.1861305832862854, + "learning_rate": 5.354573491223212e-06, + "loss": 0.005, + "step": 21760 + }, + { + "epoch": 1.3323948834077972, + "grad_norm": 0.15587164461612701, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0044, + "step": 21770 + }, + { + "epoch": 1.3330069159679294, + "grad_norm": 0.6852900981903076, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0075, + "step": 21780 + }, + { + "epoch": 1.3336189485280616, + "grad_norm": 0.14315280318260193, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0034, + "step": 21790 + }, + { + "epoch": 1.3342309810881938, + "grad_norm": 0.350981205701828, + "learning_rate": 5.324254018551227e-06, + "loss": 0.004, + "step": 21800 + }, + { + "epoch": 1.334843013648326, + "grad_norm": 0.12344911694526672, + "learning_rate": 5.316690780174352e-06, + "loss": 0.004, + "step": 21810 + }, + { + "epoch": 1.3354550462084582, + "grad_norm": 0.18744061887264252, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3360670787685904, + "grad_norm": 0.22747837007045746, + "learning_rate": 5.301584321328435e-06, + "loss": 0.004, + "step": 21830 + }, + { + "epoch": 1.3366791113287226, + "grad_norm": 0.22695699334144592, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0042, + "step": 21840 + }, + { + "epoch": 1.3372911438888548, + "grad_norm": 0.17258964478969574, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0044, + "step": 21850 + }, + { + "epoch": 1.337903176448987, + "grad_norm": 0.1523793637752533, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0047, + "step": 21860 + }, + { + "epoch": 1.3385152090091192, + "grad_norm": 0.1983587145805359, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0037, + "step": 21870 + }, + { + "epoch": 1.3391272415692514, + "grad_norm": 0.1263747215270996, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0034, + "step": 21880 + }, + { + "epoch": 1.3397392741293837, + "grad_norm": 0.1550009399652481, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3403513066895159, + "grad_norm": 0.14963915944099426, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.340963339249648, + "grad_norm": 0.17783671617507935, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0036, + "step": 21910 + }, + { + "epoch": 1.3415753718097803, + "grad_norm": 0.2715896964073181, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3421874043699125, + "grad_norm": 0.22924886643886566, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0037, + "step": 21930 + }, + { + "epoch": 1.3427994369300447, + "grad_norm": 0.13689789175987244, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0033, + "step": 21940 + }, + { + "epoch": 1.3434114694901769, + "grad_norm": 0.09137748926877975, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0031, + "step": 21950 + }, + { + "epoch": 1.344023502050309, + "grad_norm": 0.17097881436347961, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0031, + "step": 21960 + }, + { + "epoch": 1.3446355346104413, + "grad_norm": 0.23919200897216797, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0046, + "step": 21970 + }, + { + "epoch": 1.3452475671705735, + "grad_norm": 0.14261527359485626, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0037, + "step": 21980 + }, + { + "epoch": 1.3458595997307057, + "grad_norm": 0.156734898686409, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.3464716322908379, + "grad_norm": 0.21755588054656982, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0032, + "step": 22000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.374205589323776e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/training_args.bin b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd9e28a44ae85140e2ef027a82e8be4c39167cc4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5644791eb57bcb4c4808b4c2429b71e4c49eece4fc60f263f4553a3380f230bb +size 6097 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/generation_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b02a578d09d829933f7561cafa13ff0e6a9eaccc --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d06998e3214d4c6c19d5dfe668c00f396b6d8e946a8cc8859e3db82738a8288 +size 4921072616 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d973bf7779a6758adf7c7f316bc686b146d492e8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2cea08dfb3c0aed125f044350834984d3374d56b3c609c9194d57a7c8be73af +size 4978830984 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..303f194da8cef9fbe1f84e77d39cd91956de71f2 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3994bb8bc192ffda7a5a63321194a5c1e20952dc60557bba83b111bfb1014c +size 4100977896 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..f33de4b80f47e0bac1a414431a8354d8345d60c5 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.65332532291412, + -30.64622355117798, + -14.452480476760865, + -1.8581012797355654, + -2.2742317820549007, + -1.9569469915390014, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 3.0011677881240857, + 22.348905650329584, + 21.68580058555603, + 2.3937565994262693, + 4.117288079452516, + 3.295379007720948, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.570000648498535, + -1.0618462562561035, + 3.623035430908203, + 0.010442602448165417, + 0.7240540385246277, + 0.44398337602615356, + 0.12898989021778107, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.04909086227417, + 17.099597930908203, + 8.363018989562988, + 0.6997263431549072, + 1.1358375549316406, + 0.9687971472740173, + 0.9916459321975708, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.777750787353515, + -21.249025872802733, + -2.4021557040214536, + -4.092200187206268, + -3.2986312219619753, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.645499613952634, + 30.59561934127808, + 14.405443457031247, + 1.8499586300849913, + 2.268683268356323, + 1.963451420021057, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.6817545890808105, + 1.3444018363952637, + -3.5411791801452637, + -0.009792014956474304, + -0.7230188846588135, + -0.44849714636802673, + 0.15749873220920563, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.988739013671875, + 16.884004592895508, + 8.242538452148438, + 0.6991510391235352, + 1.1302146911621094, + 0.9690405130386353, + 0.9875192046165466, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9cdcd4d3db6f03b52aa770109c1ee6857bb653f --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json @@ -0,0 +1,16834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4688781443172778, + "eval_steps": 500, + "global_step": 24000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006120325601321991, + "grad_norm": 2.2432243824005127, + "learning_rate": 1.8e-07, + "loss": 0.1384, + "step": 10 + }, + { + "epoch": 0.0012240651202643981, + "grad_norm": 1.959119439125061, + "learning_rate": 3.8e-07, + "loss": 0.1388, + "step": 20 + }, + { + "epoch": 0.001836097680396597, + "grad_norm": 1.8843899965286255, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1307, + "step": 30 + }, + { + "epoch": 0.0024481302405287963, + "grad_norm": 1.7569042444229126, + "learning_rate": 7.8e-07, + "loss": 0.1238, + "step": 40 + }, + { + "epoch": 0.0030601628006609954, + "grad_norm": 2.6189017295837402, + "learning_rate": 9.800000000000001e-07, + "loss": 0.1275, + "step": 50 + }, + { + "epoch": 0.003672195360793194, + "grad_norm": 1.8418694734573364, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.1032, + "step": 60 + }, + { + "epoch": 0.004284227920925393, + "grad_norm": 1.481676697731018, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0816, + "step": 70 + }, + { + "epoch": 0.004896260481057593, + "grad_norm": 0.9590038061141968, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0732, + "step": 80 + }, + { + "epoch": 0.005508293041189791, + "grad_norm": 1.002897024154663, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0608, + "step": 90 + }, + { + "epoch": 0.006120325601321991, + "grad_norm": 0.9830108284950256, + "learning_rate": 1.98e-06, + "loss": 0.042, + "step": 100 + }, + { + "epoch": 0.006732358161454189, + "grad_norm": 0.858244001865387, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0314, + "step": 110 + }, + { + "epoch": 0.007344390721586388, + "grad_norm": 0.5761063694953918, + "learning_rate": 2.38e-06, + "loss": 0.029, + "step": 120 + }, + { + "epoch": 0.007956423281718587, + "grad_norm": 0.5434514284133911, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0227, + "step": 130 + }, + { + "epoch": 0.008568455841850786, + "grad_norm": 0.6488766670227051, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0202, + "step": 140 + }, + { + "epoch": 0.009180488401982986, + "grad_norm": 0.36763015389442444, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0157, + "step": 150 + }, + { + "epoch": 0.009792520962115185, + "grad_norm": 0.49271446466445923, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.010404553522247383, + "grad_norm": 0.23608209192752838, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0122, + "step": 170 + }, + { + "epoch": 0.011016586082379582, + "grad_norm": 0.47871828079223633, + "learning_rate": 3.58e-06, + "loss": 0.0131, + "step": 180 + }, + { + "epoch": 0.011628618642511782, + "grad_norm": 0.6862446069717407, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0131, + "step": 190 + }, + { + "epoch": 0.012240651202643981, + "grad_norm": 0.7964349389076233, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 0.01285268376277618, + "grad_norm": 0.5564846396446228, + "learning_rate": 4.18e-06, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 0.013464716322908379, + "grad_norm": 0.2810452878475189, + "learning_rate": 4.38e-06, + "loss": 0.0128, + "step": 220 + }, + { + "epoch": 0.014076748883040578, + "grad_norm": 0.4474979341030121, + "learning_rate": 4.58e-06, + "loss": 0.0188, + "step": 230 + }, + { + "epoch": 0.014688781443172776, + "grad_norm": 0.47965875267982483, + "learning_rate": 4.78e-06, + "loss": 0.0141, + "step": 240 + }, + { + "epoch": 0.015300814003304975, + "grad_norm": 0.3410812020301819, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0085, + "step": 250 + }, + { + "epoch": 0.015912846563437173, + "grad_norm": 0.39907002449035645, + "learning_rate": 5.18e-06, + "loss": 0.0106, + "step": 260 + }, + { + "epoch": 0.016524879123569373, + "grad_norm": 0.28909367322921753, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0103, + "step": 270 + }, + { + "epoch": 0.017136911683701572, + "grad_norm": 0.31524109840393066, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0101, + "step": 280 + }, + { + "epoch": 0.017748944243833772, + "grad_norm": 0.29430100321769714, + "learning_rate": 5.78e-06, + "loss": 0.0109, + "step": 290 + }, + { + "epoch": 0.01836097680396597, + "grad_norm": 0.2709169387817383, + "learning_rate": 5.98e-06, + "loss": 0.0102, + "step": 300 + }, + { + "epoch": 0.01897300936409817, + "grad_norm": 0.33067119121551514, + "learning_rate": 6.18e-06, + "loss": 0.0095, + "step": 310 + }, + { + "epoch": 0.01958504192423037, + "grad_norm": 0.28110620379447937, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0102, + "step": 320 + }, + { + "epoch": 0.02019707448436257, + "grad_norm": 0.27736902236938477, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0088, + "step": 330 + }, + { + "epoch": 0.020809107044494766, + "grad_norm": 0.3238557279109955, + "learning_rate": 6.780000000000001e-06, + "loss": 0.01, + "step": 340 + }, + { + "epoch": 0.021421139604626965, + "grad_norm": 0.30263441801071167, + "learning_rate": 6.98e-06, + "loss": 0.0095, + "step": 350 + }, + { + "epoch": 0.022033172164759165, + "grad_norm": 0.2618265450000763, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0096, + "step": 360 + }, + { + "epoch": 0.022645204724891364, + "grad_norm": 0.272565633058548, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0093, + "step": 370 + }, + { + "epoch": 0.023257237285023564, + "grad_norm": 0.44272440671920776, + "learning_rate": 7.58e-06, + "loss": 0.0087, + "step": 380 + }, + { + "epoch": 0.023869269845155763, + "grad_norm": 0.27631404995918274, + "learning_rate": 7.78e-06, + "loss": 0.0093, + "step": 390 + }, + { + "epoch": 0.024481302405287963, + "grad_norm": 0.4108494520187378, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0093, + "step": 400 + }, + { + "epoch": 0.02509333496542016, + "grad_norm": 0.43498387932777405, + "learning_rate": 8.18e-06, + "loss": 0.0098, + "step": 410 + }, + { + "epoch": 0.02570536752555236, + "grad_norm": 0.3419845700263977, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0091, + "step": 420 + }, + { + "epoch": 0.026317400085684558, + "grad_norm": 0.5677013993263245, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0104, + "step": 430 + }, + { + "epoch": 0.026929432645816757, + "grad_norm": 0.24424298107624054, + "learning_rate": 8.78e-06, + "loss": 0.0089, + "step": 440 + }, + { + "epoch": 0.027541465205948957, + "grad_norm": 0.267781138420105, + "learning_rate": 8.98e-06, + "loss": 0.0107, + "step": 450 + }, + { + "epoch": 0.028153497766081156, + "grad_norm": 0.38459253311157227, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0081, + "step": 460 + }, + { + "epoch": 0.028765530326213356, + "grad_norm": 0.2647954523563385, + "learning_rate": 9.38e-06, + "loss": 0.0082, + "step": 470 + }, + { + "epoch": 0.029377562886345552, + "grad_norm": 0.44312018156051636, + "learning_rate": 9.58e-06, + "loss": 0.0102, + "step": 480 + }, + { + "epoch": 0.02998959544647775, + "grad_norm": 0.2309781014919281, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0118, + "step": 490 + }, + { + "epoch": 0.03060162800660995, + "grad_norm": 0.41755014657974243, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0094, + "step": 500 + }, + { + "epoch": 0.03121366056674215, + "grad_norm": 0.38537120819091797, + "learning_rate": 1.018e-05, + "loss": 0.011, + "step": 510 + }, + { + "epoch": 0.031825693126874346, + "grad_norm": 0.49801477789878845, + "learning_rate": 1.038e-05, + "loss": 0.0093, + "step": 520 + }, + { + "epoch": 0.03243772568700655, + "grad_norm": 0.3854966163635254, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0116, + "step": 530 + }, + { + "epoch": 0.033049758247138745, + "grad_norm": 0.3163810968399048, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.008, + "step": 540 + }, + { + "epoch": 0.03366179080727095, + "grad_norm": 0.33000636100769043, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0093, + "step": 550 + }, + { + "epoch": 0.034273823367403145, + "grad_norm": 0.3350297808647156, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0083, + "step": 560 + }, + { + "epoch": 0.03488585592753535, + "grad_norm": 0.18780949711799622, + "learning_rate": 1.138e-05, + "loss": 0.0097, + "step": 570 + }, + { + "epoch": 0.035497888487667544, + "grad_norm": 0.20399607717990875, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0092, + "step": 580 + }, + { + "epoch": 0.03610992104779974, + "grad_norm": 0.15931005775928497, + "learning_rate": 1.178e-05, + "loss": 0.0076, + "step": 590 + }, + { + "epoch": 0.03672195360793194, + "grad_norm": 0.20751547813415527, + "learning_rate": 1.198e-05, + "loss": 0.0079, + "step": 600 + }, + { + "epoch": 0.03733398616806414, + "grad_norm": 0.39666953682899475, + "learning_rate": 1.218e-05, + "loss": 0.0072, + "step": 610 + }, + { + "epoch": 0.03794601872819634, + "grad_norm": 0.385407030582428, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0089, + "step": 620 + }, + { + "epoch": 0.03855805128832854, + "grad_norm": 0.5228332877159119, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0092, + "step": 630 + }, + { + "epoch": 0.03917008384846074, + "grad_norm": 0.29315415024757385, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0098, + "step": 640 + }, + { + "epoch": 0.03978211640859294, + "grad_norm": 0.4300646483898163, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0079, + "step": 650 + }, + { + "epoch": 0.04039414896872514, + "grad_norm": 0.38021156191825867, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0103, + "step": 660 + }, + { + "epoch": 0.041006181528857336, + "grad_norm": 0.43489688634872437, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0105, + "step": 670 + }, + { + "epoch": 0.04161821408898953, + "grad_norm": 0.48019328713417053, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0124, + "step": 680 + }, + { + "epoch": 0.042230246649121735, + "grad_norm": 0.28486984968185425, + "learning_rate": 1.378e-05, + "loss": 0.0122, + "step": 690 + }, + { + "epoch": 0.04284227920925393, + "grad_norm": 0.35172080993652344, + "learning_rate": 1.398e-05, + "loss": 0.0093, + "step": 700 + }, + { + "epoch": 0.043454311769386134, + "grad_norm": 0.32531124353408813, + "learning_rate": 1.418e-05, + "loss": 0.0116, + "step": 710 + }, + { + "epoch": 0.04406634432951833, + "grad_norm": 0.388637512922287, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0077, + "step": 720 + }, + { + "epoch": 0.04467837688965053, + "grad_norm": 0.3816429078578949, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0123, + "step": 730 + }, + { + "epoch": 0.04529040944978273, + "grad_norm": 0.22786036133766174, + "learning_rate": 1.478e-05, + "loss": 0.0089, + "step": 740 + }, + { + "epoch": 0.045902442009914925, + "grad_norm": 0.2965328097343445, + "learning_rate": 1.498e-05, + "loss": 0.011, + "step": 750 + }, + { + "epoch": 0.04651447457004713, + "grad_norm": 0.3568362593650818, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0101, + "step": 760 + }, + { + "epoch": 0.047126507130179324, + "grad_norm": 0.2972166836261749, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0093, + "step": 770 + }, + { + "epoch": 0.04773853969031153, + "grad_norm": 0.4221388101577759, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.012, + "step": 780 + }, + { + "epoch": 0.04835057225044372, + "grad_norm": 0.37255391478538513, + "learning_rate": 1.578e-05, + "loss": 0.0085, + "step": 790 + }, + { + "epoch": 0.048962604810575926, + "grad_norm": 0.36007094383239746, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.008, + "step": 800 + }, + { + "epoch": 0.04957463737070812, + "grad_norm": 0.40588808059692383, + "learning_rate": 1.618e-05, + "loss": 0.0081, + "step": 810 + }, + { + "epoch": 0.05018666993084032, + "grad_norm": 0.46563687920570374, + "learning_rate": 1.638e-05, + "loss": 0.0076, + "step": 820 + }, + { + "epoch": 0.05079870249097252, + "grad_norm": 0.3161381483078003, + "learning_rate": 1.658e-05, + "loss": 0.0129, + "step": 830 + }, + { + "epoch": 0.05141073505110472, + "grad_norm": 0.3800298869609833, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0146, + "step": 840 + }, + { + "epoch": 0.05202276761123692, + "grad_norm": 0.36572107672691345, + "learning_rate": 1.698e-05, + "loss": 0.0148, + "step": 850 + }, + { + "epoch": 0.052634800171369116, + "grad_norm": 0.4084141254425049, + "learning_rate": 1.718e-05, + "loss": 0.0085, + "step": 860 + }, + { + "epoch": 0.05324683273150132, + "grad_norm": 0.2906867265701294, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0116, + "step": 870 + }, + { + "epoch": 0.053858865291633515, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.758e-05, + "loss": 0.0076, + "step": 880 + }, + { + "epoch": 0.05447089785176571, + "grad_norm": 0.5292996764183044, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0094, + "step": 890 + }, + { + "epoch": 0.055082930411897914, + "grad_norm": 0.23192685842514038, + "learning_rate": 1.798e-05, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 0.05569496297203011, + "grad_norm": 0.41050270199775696, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0099, + "step": 910 + }, + { + "epoch": 0.05630699553216231, + "grad_norm": 0.3336002230644226, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.05691902809229451, + "grad_norm": 0.46233776211738586, + "learning_rate": 1.858e-05, + "loss": 0.0104, + "step": 930 + }, + { + "epoch": 0.05753106065242671, + "grad_norm": 0.36776405572891235, + "learning_rate": 1.878e-05, + "loss": 0.0115, + "step": 940 + }, + { + "epoch": 0.05814309321255891, + "grad_norm": 0.47848618030548096, + "learning_rate": 1.898e-05, + "loss": 0.0108, + "step": 950 + }, + { + "epoch": 0.058755125772691104, + "grad_norm": 0.35507604479789734, + "learning_rate": 1.918e-05, + "loss": 0.0095, + "step": 960 + }, + { + "epoch": 0.05936715833282331, + "grad_norm": 0.4613397717475891, + "learning_rate": 1.938e-05, + "loss": 0.0119, + "step": 970 + }, + { + "epoch": 0.0599791908929555, + "grad_norm": 0.34492260217666626, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0109, + "step": 980 + }, + { + "epoch": 0.060591223453087706, + "grad_norm": 0.34624582529067993, + "learning_rate": 1.978e-05, + "loss": 0.0099, + "step": 990 + }, + { + "epoch": 0.0612032560132199, + "grad_norm": 0.9161475896835327, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0109, + "step": 1000 + }, + { + "epoch": 0.061815288573352105, + "grad_norm": 0.367807537317276, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0087, + "step": 1010 + }, + { + "epoch": 0.0624273211334843, + "grad_norm": 0.4043216407299042, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.0084, + "step": 1020 + }, + { + "epoch": 0.0630393536936165, + "grad_norm": 0.315305233001709, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0075, + "step": 1030 + }, + { + "epoch": 0.06365138625374869, + "grad_norm": 0.49702969193458557, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0103, + "step": 1040 + }, + { + "epoch": 0.0642634188138809, + "grad_norm": 0.46286216378211975, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0116, + "step": 1050 + }, + { + "epoch": 0.0648754513740131, + "grad_norm": 0.332142174243927, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0103, + "step": 1060 + }, + { + "epoch": 0.0654874839341453, + "grad_norm": 0.6118510961532593, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0118, + "step": 1070 + }, + { + "epoch": 0.06609951649427749, + "grad_norm": 0.49074795842170715, + "learning_rate": 1.999967041472886e-05, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.0667115490544097, + "grad_norm": 0.42575374245643616, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0125, + "step": 1090 + }, + { + "epoch": 0.0673235816145419, + "grad_norm": 0.3223794996738434, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0091, + "step": 1100 + }, + { + "epoch": 0.06793561417467409, + "grad_norm": 0.4952760636806488, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.011, + "step": 1110 + }, + { + "epoch": 0.06854764673480629, + "grad_norm": 0.36144813895225525, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0096, + "step": 1120 + }, + { + "epoch": 0.06915967929493849, + "grad_norm": 0.31190025806427, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0115, + "step": 1130 + }, + { + "epoch": 0.0697717118550707, + "grad_norm": 0.7014928460121155, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.07038374441520288, + "grad_norm": 0.4382205605506897, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0111, + "step": 1150 + }, + { + "epoch": 0.07099577697533509, + "grad_norm": 0.3750714659690857, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0116, + "step": 1160 + }, + { + "epoch": 0.07160780953546729, + "grad_norm": 0.4174371361732483, + "learning_rate": 1.999849173538598e-05, + "loss": 0.009, + "step": 1170 + }, + { + "epoch": 0.07221984209559948, + "grad_norm": 0.44394591450691223, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.07283187465573168, + "grad_norm": 0.43412888050079346, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0098, + "step": 1190 + }, + { + "epoch": 0.07344390721586389, + "grad_norm": 0.6421196460723877, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.01, + "step": 1200 + }, + { + "epoch": 0.07405593977599609, + "grad_norm": 0.6313903331756592, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0137, + "step": 1210 + }, + { + "epoch": 0.07466797233612828, + "grad_norm": 0.49340254068374634, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0104, + "step": 1220 + }, + { + "epoch": 0.07528000489626048, + "grad_norm": 0.40420663356781006, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0126, + "step": 1230 + }, + { + "epoch": 0.07589203745639268, + "grad_norm": 0.3955318033695221, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.013, + "step": 1240 + }, + { + "epoch": 0.07650407001652489, + "grad_norm": 0.4967520236968994, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0098, + "step": 1250 + }, + { + "epoch": 0.07711610257665708, + "grad_norm": 0.3380029499530792, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0084, + "step": 1260 + }, + { + "epoch": 0.07772813513678928, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.009, + "step": 1270 + }, + { + "epoch": 0.07834016769692148, + "grad_norm": 0.4533286392688751, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0093, + "step": 1280 + }, + { + "epoch": 0.07895220025705367, + "grad_norm": 0.39559242129325867, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0106, + "step": 1290 + }, + { + "epoch": 0.07956423281718587, + "grad_norm": 0.23190362751483917, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.01, + "step": 1300 + }, + { + "epoch": 0.08017626537731808, + "grad_norm": 0.4732286334037781, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.08078829793745028, + "grad_norm": 0.3010174036026001, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.08140033049758247, + "grad_norm": 0.3989834189414978, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0097, + "step": 1330 + }, + { + "epoch": 0.08201236305771467, + "grad_norm": 0.4597114622592926, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.01, + "step": 1340 + }, + { + "epoch": 0.08262439561784687, + "grad_norm": 0.426826536655426, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.011, + "step": 1350 + }, + { + "epoch": 0.08323642817797906, + "grad_norm": 0.4876341223716736, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0105, + "step": 1360 + }, + { + "epoch": 0.08384846073811127, + "grad_norm": 0.5444457530975342, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0099, + "step": 1370 + }, + { + "epoch": 0.08446049329824347, + "grad_norm": 0.5096126794815063, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.007, + "step": 1380 + }, + { + "epoch": 0.08507252585837567, + "grad_norm": 0.43828368186950684, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.08568455841850786, + "grad_norm": 0.40163955092430115, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0103, + "step": 1400 + }, + { + "epoch": 0.08629659097864006, + "grad_norm": 0.3110432028770447, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0115, + "step": 1410 + }, + { + "epoch": 0.08690862353877227, + "grad_norm": 0.8393893241882324, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.012, + "step": 1420 + }, + { + "epoch": 0.08752065609890446, + "grad_norm": 0.2751714289188385, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0093, + "step": 1430 + }, + { + "epoch": 0.08813268865903666, + "grad_norm": 0.36969971656799316, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0112, + "step": 1440 + }, + { + "epoch": 0.08874472121916886, + "grad_norm": 0.3721938729286194, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08935675377930107, + "grad_norm": 0.26564934849739075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0107, + "step": 1460 + }, + { + "epoch": 0.08996878633943325, + "grad_norm": 0.36552169919013977, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0123, + "step": 1470 + }, + { + "epoch": 0.09058081889956546, + "grad_norm": 0.23664990067481995, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0074, + "step": 1480 + }, + { + "epoch": 0.09119285145969766, + "grad_norm": 0.49903133511543274, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.09180488401982985, + "grad_norm": 0.43505051732063293, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 0.09241691657996205, + "grad_norm": 0.20318932831287384, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0088, + "step": 1510 + }, + { + "epoch": 0.09302894914009426, + "grad_norm": 0.3289708197116852, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.008, + "step": 1520 + }, + { + "epoch": 0.09364098170022646, + "grad_norm": 0.3920934200286865, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0145, + "step": 1530 + }, + { + "epoch": 0.09425301426035865, + "grad_norm": 0.40396374464035034, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.09486504682049085, + "grad_norm": 0.4044182300567627, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.012, + "step": 1550 + }, + { + "epoch": 0.09547707938062305, + "grad_norm": 0.2318611741065979, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0115, + "step": 1560 + }, + { + "epoch": 0.09608911194075524, + "grad_norm": 0.3905714750289917, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.008, + "step": 1570 + }, + { + "epoch": 0.09670114450088745, + "grad_norm": 0.2516922652721405, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0084, + "step": 1580 + }, + { + "epoch": 0.09731317706101965, + "grad_norm": 0.338455468416214, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0122, + "step": 1590 + }, + { + "epoch": 0.09792520962115185, + "grad_norm": 0.31875041127204895, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0081, + "step": 1600 + }, + { + "epoch": 0.09853724218128404, + "grad_norm": 0.2996121644973755, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0068, + "step": 1610 + }, + { + "epoch": 0.09914927474141624, + "grad_norm": 0.4381162226200104, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0103, + "step": 1620 + }, + { + "epoch": 0.09976130730154845, + "grad_norm": 0.5531038045883179, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0168, + "step": 1630 + }, + { + "epoch": 0.10037333986168064, + "grad_norm": 1.1283385753631592, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0119, + "step": 1640 + }, + { + "epoch": 0.10098537242181284, + "grad_norm": 0.38017332553863525, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0096, + "step": 1650 + }, + { + "epoch": 0.10159740498194504, + "grad_norm": 0.4669477045536041, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0111, + "step": 1660 + }, + { + "epoch": 0.10220943754207724, + "grad_norm": 0.3903254270553589, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0098, + "step": 1670 + }, + { + "epoch": 0.10282147010220943, + "grad_norm": 0.49671587347984314, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0083, + "step": 1680 + }, + { + "epoch": 0.10343350266234164, + "grad_norm": 0.36555853486061096, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0088, + "step": 1690 + }, + { + "epoch": 0.10404553522247384, + "grad_norm": 0.21804726123809814, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0086, + "step": 1700 + }, + { + "epoch": 0.10465756778260603, + "grad_norm": 0.6744784116744995, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0109, + "step": 1710 + }, + { + "epoch": 0.10526960034273823, + "grad_norm": 0.34379470348358154, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0096, + "step": 1720 + }, + { + "epoch": 0.10588163290287043, + "grad_norm": 0.27760598063468933, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 0.10649366546300264, + "grad_norm": 0.36294442415237427, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10710569802313483, + "grad_norm": 0.42200908064842224, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.011, + "step": 1750 + }, + { + "epoch": 0.10771773058326703, + "grad_norm": 0.47863906621932983, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0107, + "step": 1760 + }, + { + "epoch": 0.10832976314339923, + "grad_norm": 0.32717248797416687, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0112, + "step": 1770 + }, + { + "epoch": 0.10894179570353142, + "grad_norm": 0.4255545735359192, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0106, + "step": 1780 + }, + { + "epoch": 0.10955382826366362, + "grad_norm": 0.5034983158111572, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0098, + "step": 1790 + }, + { + "epoch": 0.11016586082379583, + "grad_norm": 0.37071412801742554, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0099, + "step": 1800 + }, + { + "epoch": 0.11077789338392803, + "grad_norm": 0.23624737560749054, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0069, + "step": 1810 + }, + { + "epoch": 0.11138992594406022, + "grad_norm": 0.5815485715866089, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0095, + "step": 1820 + }, + { + "epoch": 0.11200195850419242, + "grad_norm": 1.1828722953796387, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0104, + "step": 1830 + }, + { + "epoch": 0.11261399106432463, + "grad_norm": 0.38099589943885803, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0106, + "step": 1840 + }, + { + "epoch": 0.11322602362445681, + "grad_norm": 0.38476184010505676, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0112, + "step": 1850 + }, + { + "epoch": 0.11383805618458902, + "grad_norm": 0.48982104659080505, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0125, + "step": 1860 + }, + { + "epoch": 0.11445008874472122, + "grad_norm": 0.4165821671485901, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0099, + "step": 1870 + }, + { + "epoch": 0.11506212130485342, + "grad_norm": 0.3412662446498871, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0061, + "step": 1880 + }, + { + "epoch": 0.11567415386498561, + "grad_norm": 0.46617937088012695, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0129, + "step": 1890 + }, + { + "epoch": 0.11628618642511782, + "grad_norm": 0.2705824077129364, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0082, + "step": 1900 + }, + { + "epoch": 0.11689821898525002, + "grad_norm": 0.3567829430103302, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0125, + "step": 1910 + }, + { + "epoch": 0.11751025154538221, + "grad_norm": 0.4438138008117676, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0129, + "step": 1920 + }, + { + "epoch": 0.11812228410551441, + "grad_norm": 0.356703519821167, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0083, + "step": 1930 + }, + { + "epoch": 0.11873431666564661, + "grad_norm": 0.6039804220199585, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0086, + "step": 1940 + }, + { + "epoch": 0.11934634922577882, + "grad_norm": 0.4572801887989044, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0067, + "step": 1950 + }, + { + "epoch": 0.119958381785911, + "grad_norm": 0.5063445568084717, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0091, + "step": 1960 + }, + { + "epoch": 0.12057041434604321, + "grad_norm": 0.3467857837677002, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.008, + "step": 1970 + }, + { + "epoch": 0.12118244690617541, + "grad_norm": 0.4875742197036743, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0102, + "step": 1980 + }, + { + "epoch": 0.1217944794663076, + "grad_norm": 0.3209119141101837, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0081, + "step": 1990 + }, + { + "epoch": 0.1224065120264398, + "grad_norm": 0.4731980860233307, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0118, + "step": 2000 + }, + { + "epoch": 0.123018544586572, + "grad_norm": 0.5742963552474976, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0125, + "step": 2010 + }, + { + "epoch": 0.12363057714670421, + "grad_norm": 0.41357406973838806, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.1242426097068364, + "grad_norm": 0.6277521252632141, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0096, + "step": 2030 + }, + { + "epoch": 0.1248546422669686, + "grad_norm": 0.41252902150154114, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0108, + "step": 2040 + }, + { + "epoch": 0.1254666748271008, + "grad_norm": 0.782122790813446, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0134, + "step": 2050 + }, + { + "epoch": 0.126078707387233, + "grad_norm": 0.45011264085769653, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0102, + "step": 2060 + }, + { + "epoch": 0.1266907399473652, + "grad_norm": 0.2724951207637787, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0088, + "step": 2070 + }, + { + "epoch": 0.12730277250749739, + "grad_norm": 0.2351481169462204, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.009, + "step": 2080 + }, + { + "epoch": 0.1279148050676296, + "grad_norm": 0.34568479657173157, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0092, + "step": 2090 + }, + { + "epoch": 0.1285268376277618, + "grad_norm": 0.44493499398231506, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0087, + "step": 2100 + }, + { + "epoch": 0.129138870187894, + "grad_norm": 0.3011283874511719, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0108, + "step": 2110 + }, + { + "epoch": 0.1297509027480262, + "grad_norm": 0.4170232117176056, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.1303629353081584, + "grad_norm": 0.2696056365966797, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0093, + "step": 2130 + }, + { + "epoch": 0.1309749678682906, + "grad_norm": 0.4092336893081665, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0083, + "step": 2140 + }, + { + "epoch": 0.13158700042842278, + "grad_norm": 0.36637401580810547, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.01, + "step": 2150 + }, + { + "epoch": 0.13219903298855498, + "grad_norm": 0.28675684332847595, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0079, + "step": 2160 + }, + { + "epoch": 0.13281106554868718, + "grad_norm": 0.27699902653694153, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0071, + "step": 2170 + }, + { + "epoch": 0.1334230981088194, + "grad_norm": 0.3832298517227173, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0104, + "step": 2180 + }, + { + "epoch": 0.1340351306689516, + "grad_norm": 0.3590598702430725, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 0.1346471632290838, + "grad_norm": 0.21830014884471893, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0093, + "step": 2200 + }, + { + "epoch": 0.135259195789216, + "grad_norm": 0.342492938041687, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0109, + "step": 2210 + }, + { + "epoch": 0.13587122834934817, + "grad_norm": 0.6337023973464966, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0082, + "step": 2220 + }, + { + "epoch": 0.13648326090948038, + "grad_norm": 0.41742798686027527, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13709529346961258, + "grad_norm": 0.3180190324783325, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 0.13770732602974478, + "grad_norm": 0.36720144748687744, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0114, + "step": 2250 + }, + { + "epoch": 0.13831935858987698, + "grad_norm": 0.29457366466522217, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0077, + "step": 2260 + }, + { + "epoch": 0.1389313911500092, + "grad_norm": 0.24702222645282745, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0074, + "step": 2270 + }, + { + "epoch": 0.1395434237101414, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0079, + "step": 2280 + }, + { + "epoch": 0.14015545627027357, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0127, + "step": 2290 + }, + { + "epoch": 0.14076748883040577, + "grad_norm": 0.44338247179985046, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0077, + "step": 2300 + }, + { + "epoch": 0.14137952139053797, + "grad_norm": 0.31765618920326233, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0091, + "step": 2310 + }, + { + "epoch": 0.14199155395067017, + "grad_norm": 0.322534441947937, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0068, + "step": 2320 + }, + { + "epoch": 0.14260358651080238, + "grad_norm": 0.23571068048477173, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0094, + "step": 2330 + }, + { + "epoch": 0.14321561907093458, + "grad_norm": 0.26818808913230896, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0092, + "step": 2340 + }, + { + "epoch": 0.14382765163106678, + "grad_norm": 0.31886982917785645, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0084, + "step": 2350 + }, + { + "epoch": 0.14443968419119896, + "grad_norm": 0.5176070928573608, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0104, + "step": 2360 + }, + { + "epoch": 0.14505171675133116, + "grad_norm": 0.4322161078453064, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0081, + "step": 2370 + }, + { + "epoch": 0.14566374931146336, + "grad_norm": 0.4076510965824127, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0102, + "step": 2380 + }, + { + "epoch": 0.14627578187159557, + "grad_norm": 0.3808838725090027, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0096, + "step": 2390 + }, + { + "epoch": 0.14688781443172777, + "grad_norm": 0.5045232176780701, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0109, + "step": 2400 + }, + { + "epoch": 0.14749984699185997, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0077, + "step": 2410 + }, + { + "epoch": 0.14811187955199218, + "grad_norm": 0.28561875224113464, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0141, + "step": 2420 + }, + { + "epoch": 0.14872391211212435, + "grad_norm": 0.414410799741745, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0094, + "step": 2430 + }, + { + "epoch": 0.14933594467225655, + "grad_norm": 0.4587285816669464, + "learning_rate": 1.989086647373215e-05, + "loss": 0.009, + "step": 2440 + }, + { + "epoch": 0.14994797723238876, + "grad_norm": 0.7567377686500549, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0084, + "step": 2450 + }, + { + "epoch": 0.15056000979252096, + "grad_norm": 0.4980221390724182, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0087, + "step": 2460 + }, + { + "epoch": 0.15117204235265316, + "grad_norm": 0.41810303926467896, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0082, + "step": 2470 + }, + { + "epoch": 0.15178407491278537, + "grad_norm": 0.4193445146083832, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 0.15239610747291757, + "grad_norm": 0.2561246156692505, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0074, + "step": 2490 + }, + { + "epoch": 0.15300814003304977, + "grad_norm": 0.22316500544548035, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0069, + "step": 2500 + }, + { + "epoch": 0.15362017259318195, + "grad_norm": 0.31504112482070923, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0097, + "step": 2510 + }, + { + "epoch": 0.15423220515331415, + "grad_norm": 0.2944568991661072, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0101, + "step": 2520 + }, + { + "epoch": 0.15484423771344635, + "grad_norm": 0.2744649052619934, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0074, + "step": 2530 + }, + { + "epoch": 0.15545627027357856, + "grad_norm": 0.2717166841030121, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.007, + "step": 2540 + }, + { + "epoch": 0.15606830283371076, + "grad_norm": 0.32652929425239563, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.15668033539384296, + "grad_norm": 0.3169964849948883, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0089, + "step": 2560 + }, + { + "epoch": 0.15729236795397517, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0083, + "step": 2570 + }, + { + "epoch": 0.15790440051410734, + "grad_norm": 0.3869011700153351, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0088, + "step": 2580 + }, + { + "epoch": 0.15851643307423954, + "grad_norm": 0.2944110333919525, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0085, + "step": 2590 + }, + { + "epoch": 0.15912846563437175, + "grad_norm": 0.27993839979171753, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0097, + "step": 2600 + }, + { + "epoch": 0.15974049819450395, + "grad_norm": 0.42018845677375793, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0104, + "step": 2610 + }, + { + "epoch": 0.16035253075463615, + "grad_norm": 0.45006832480430603, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0072, + "step": 2620 + }, + { + "epoch": 0.16096456331476836, + "grad_norm": 0.275564581155777, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0072, + "step": 2630 + }, + { + "epoch": 0.16157659587490056, + "grad_norm": 0.503052294254303, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0091, + "step": 2640 + }, + { + "epoch": 0.16218862843503273, + "grad_norm": 0.33740976452827454, + "learning_rate": 1.985678043265668e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.16280066099516494, + "grad_norm": 0.5379078984260559, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0097, + "step": 2660 + }, + { + "epoch": 0.16341269355529714, + "grad_norm": 0.3605813980102539, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0109, + "step": 2670 + }, + { + "epoch": 0.16402472611542934, + "grad_norm": 0.49490585923194885, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 0.16463675867556155, + "grad_norm": 0.29894375801086426, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0089, + "step": 2690 + }, + { + "epoch": 0.16524879123569375, + "grad_norm": 0.395270437002182, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 0.16586082379582595, + "grad_norm": 0.25507843494415283, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0078, + "step": 2710 + }, + { + "epoch": 0.16647285635595813, + "grad_norm": 0.3304852843284607, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0178, + "step": 2720 + }, + { + "epoch": 0.16708488891609033, + "grad_norm": 0.4356633126735687, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0082, + "step": 2730 + }, + { + "epoch": 0.16769692147622253, + "grad_norm": 0.4104527533054352, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 0.16830895403635474, + "grad_norm": 0.25723493099212646, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0097, + "step": 2750 + }, + { + "epoch": 0.16892098659648694, + "grad_norm": 0.3280608057975769, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0079, + "step": 2760 + }, + { + "epoch": 0.16953301915661914, + "grad_norm": 0.4641128480434418, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0081, + "step": 2770 + }, + { + "epoch": 0.17014505171675134, + "grad_norm": 0.2704941928386688, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0112, + "step": 2780 + }, + { + "epoch": 0.17075708427688352, + "grad_norm": 0.42343780398368835, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0084, + "step": 2790 + }, + { + "epoch": 0.17136911683701572, + "grad_norm": 0.2606532573699951, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0085, + "step": 2800 + }, + { + "epoch": 0.17198114939714793, + "grad_norm": 0.39099374413490295, + "learning_rate": 1.982773261916081e-05, + "loss": 0.014, + "step": 2810 + }, + { + "epoch": 0.17259318195728013, + "grad_norm": 0.32653889060020447, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0095, + "step": 2820 + }, + { + "epoch": 0.17320521451741233, + "grad_norm": 0.34765321016311646, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17381724707754453, + "grad_norm": 0.2844177186489105, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.011, + "step": 2840 + }, + { + "epoch": 0.17442927963767674, + "grad_norm": 0.5079899430274963, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0114, + "step": 2850 + }, + { + "epoch": 0.1750413121978089, + "grad_norm": 0.4043678045272827, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0073, + "step": 2860 + }, + { + "epoch": 0.17565334475794112, + "grad_norm": 0.3833003640174866, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0076, + "step": 2870 + }, + { + "epoch": 0.17626537731807332, + "grad_norm": 0.2826341986656189, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0072, + "step": 2880 + }, + { + "epoch": 0.17687740987820552, + "grad_norm": 0.6043460965156555, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0084, + "step": 2890 + }, + { + "epoch": 0.17748944243833772, + "grad_norm": 0.3238481879234314, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.17810147499846993, + "grad_norm": 0.45817995071411133, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0103, + "step": 2910 + }, + { + "epoch": 0.17871350755860213, + "grad_norm": 0.21048744022846222, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0094, + "step": 2920 + }, + { + "epoch": 0.1793255401187343, + "grad_norm": 0.3401891887187958, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0058, + "step": 2930 + }, + { + "epoch": 0.1799375726788665, + "grad_norm": 0.3655509948730469, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0071, + "step": 2940 + }, + { + "epoch": 0.1805496052389987, + "grad_norm": 0.47406241297721863, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.18116163779913091, + "grad_norm": 0.3278841972351074, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0121, + "step": 2960 + }, + { + "epoch": 0.18177367035926312, + "grad_norm": 0.271436482667923, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.008, + "step": 2970 + }, + { + "epoch": 0.18238570291939532, + "grad_norm": 0.41475561261177063, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.008, + "step": 2980 + }, + { + "epoch": 0.18299773547952752, + "grad_norm": 0.5389090776443481, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0091, + "step": 2990 + }, + { + "epoch": 0.1836097680396597, + "grad_norm": 0.3958609700202942, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0106, + "step": 3000 + }, + { + "epoch": 0.1842218005997919, + "grad_norm": 0.3456019461154938, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0084, + "step": 3010 + }, + { + "epoch": 0.1848338331599241, + "grad_norm": 0.2959386706352234, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0071, + "step": 3020 + }, + { + "epoch": 0.1854458657200563, + "grad_norm": 0.2617223858833313, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0082, + "step": 3030 + }, + { + "epoch": 0.1860578982801885, + "grad_norm": 0.45173966884613037, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0105, + "step": 3040 + }, + { + "epoch": 0.1866699308403207, + "grad_norm": 0.4127421975135803, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.008, + "step": 3050 + }, + { + "epoch": 0.18728196340045292, + "grad_norm": 0.3142230808734894, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0085, + "step": 3060 + }, + { + "epoch": 0.1878939959605851, + "grad_norm": 0.49720287322998047, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0089, + "step": 3070 + }, + { + "epoch": 0.1885060285207173, + "grad_norm": 0.6417365074157715, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0088, + "step": 3080 + }, + { + "epoch": 0.1891180610808495, + "grad_norm": 0.44801583886146545, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0098, + "step": 3090 + }, + { + "epoch": 0.1897300936409817, + "grad_norm": 0.3606127202510834, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0087, + "step": 3100 + }, + { + "epoch": 0.1903421262011139, + "grad_norm": 0.268971711397171, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0092, + "step": 3110 + }, + { + "epoch": 0.1909541587612461, + "grad_norm": 0.2367011308670044, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0074, + "step": 3120 + }, + { + "epoch": 0.1915661913213783, + "grad_norm": 0.41643625497817993, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0062, + "step": 3130 + }, + { + "epoch": 0.19217822388151048, + "grad_norm": 0.33202284574508667, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0081, + "step": 3140 + }, + { + "epoch": 0.1927902564416427, + "grad_norm": 0.279813289642334, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0074, + "step": 3150 + }, + { + "epoch": 0.1934022890017749, + "grad_norm": 0.5127174258232117, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0101, + "step": 3160 + }, + { + "epoch": 0.1940143215619071, + "grad_norm": 0.36921849846839905, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.1946263541220393, + "grad_norm": 0.3509728014469147, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0114, + "step": 3180 + }, + { + "epoch": 0.1952383866821715, + "grad_norm": 0.3088139295578003, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0089, + "step": 3190 + }, + { + "epoch": 0.1958504192423037, + "grad_norm": 0.43653762340545654, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0116, + "step": 3200 + }, + { + "epoch": 0.19646245180243588, + "grad_norm": 0.2522308826446533, + "learning_rate": 1.974353140804231e-05, + "loss": 0.007, + "step": 3210 + }, + { + "epoch": 0.19707448436256808, + "grad_norm": 0.37519100308418274, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 0.19768651692270028, + "grad_norm": 0.379027783870697, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0086, + "step": 3230 + }, + { + "epoch": 0.1982985494828325, + "grad_norm": 0.2713090479373932, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0073, + "step": 3240 + }, + { + "epoch": 0.1989105820429647, + "grad_norm": 0.41106846928596497, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0188, + "step": 3250 + }, + { + "epoch": 0.1995226146030969, + "grad_norm": 0.3914758861064911, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 0.2001346471632291, + "grad_norm": 0.4763018488883972, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0082, + "step": 3270 + }, + { + "epoch": 0.20074667972336127, + "grad_norm": 0.23002664744853973, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0085, + "step": 3280 + }, + { + "epoch": 0.20135871228349347, + "grad_norm": 0.2887377142906189, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0074, + "step": 3290 + }, + { + "epoch": 0.20197074484362568, + "grad_norm": 0.2322079837322235, + "learning_rate": 1.972231769371516e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 0.20258277740375788, + "grad_norm": 0.39307233691215515, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0095, + "step": 3310 + }, + { + "epoch": 0.20319480996389008, + "grad_norm": 0.5209783315658569, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.012, + "step": 3320 + }, + { + "epoch": 0.20380684252402229, + "grad_norm": 0.45187172293663025, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.2044188750841545, + "grad_norm": 0.480970174074173, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0072, + "step": 3340 + }, + { + "epoch": 0.20503090764428666, + "grad_norm": 0.30979010462760925, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0091, + "step": 3350 + }, + { + "epoch": 0.20564294020441887, + "grad_norm": 0.6410729289054871, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0096, + "step": 3360 + }, + { + "epoch": 0.20625497276455107, + "grad_norm": 0.23707512021064758, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0078, + "step": 3370 + }, + { + "epoch": 0.20686700532468327, + "grad_norm": 0.3029544949531555, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0115, + "step": 3380 + }, + { + "epoch": 0.20747903788481548, + "grad_norm": 0.28677740693092346, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0068, + "step": 3390 + }, + { + "epoch": 0.20809107044494768, + "grad_norm": 0.2433662712574005, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0082, + "step": 3400 + }, + { + "epoch": 0.20870310300507988, + "grad_norm": 0.38066667318344116, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0089, + "step": 3410 + }, + { + "epoch": 0.20931513556521206, + "grad_norm": 0.3830282390117645, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0073, + "step": 3420 + }, + { + "epoch": 0.20992716812534426, + "grad_norm": 0.359684556722641, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0088, + "step": 3430 + }, + { + "epoch": 0.21053920068547646, + "grad_norm": 0.3497346341609955, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0082, + "step": 3440 + }, + { + "epoch": 0.21115123324560867, + "grad_norm": 0.3664748966693878, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.21176326580574087, + "grad_norm": 0.382804811000824, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0077, + "step": 3460 + }, + { + "epoch": 0.21237529836587307, + "grad_norm": 0.22746194899082184, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0107, + "step": 3470 + }, + { + "epoch": 0.21298733092600527, + "grad_norm": 0.4094266891479492, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0096, + "step": 3480 + }, + { + "epoch": 0.21359936348613745, + "grad_norm": 0.26990365982055664, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0089, + "step": 3490 + }, + { + "epoch": 0.21421139604626965, + "grad_norm": 0.2602371275424957, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0074, + "step": 3500 + }, + { + "epoch": 0.21482342860640186, + "grad_norm": 0.34200435876846313, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21543546116653406, + "grad_norm": 0.4260508716106415, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.21604749372666626, + "grad_norm": 0.4017483592033386, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0103, + "step": 3530 + }, + { + "epoch": 0.21665952628679847, + "grad_norm": 0.40005844831466675, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0094, + "step": 3540 + }, + { + "epoch": 0.21727155884693067, + "grad_norm": 0.3856841027736664, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0095, + "step": 3550 + }, + { + "epoch": 0.21788359140706284, + "grad_norm": 0.3245168626308441, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0067, + "step": 3560 + }, + { + "epoch": 0.21849562396719505, + "grad_norm": 0.2698485255241394, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0079, + "step": 3570 + }, + { + "epoch": 0.21910765652732725, + "grad_norm": 0.24520452320575714, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0109, + "step": 3580 + }, + { + "epoch": 0.21971968908745945, + "grad_norm": 0.397175133228302, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0114, + "step": 3590 + }, + { + "epoch": 0.22033172164759166, + "grad_norm": 0.40339091420173645, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.22094375420772386, + "grad_norm": 0.404435396194458, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0087, + "step": 3610 + }, + { + "epoch": 0.22155578676785606, + "grad_norm": 0.3300188183784485, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0073, + "step": 3620 + }, + { + "epoch": 0.22216781932798824, + "grad_norm": 0.23486892879009247, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0087, + "step": 3630 + }, + { + "epoch": 0.22277985188812044, + "grad_norm": 0.37211188673973083, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.22339188444825264, + "grad_norm": 0.32422709465026855, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.009, + "step": 3650 + }, + { + "epoch": 0.22400391700838485, + "grad_norm": 0.43535664677619934, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0099, + "step": 3660 + }, + { + "epoch": 0.22461594956851705, + "grad_norm": 0.3295724093914032, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.22522798212864925, + "grad_norm": 0.2840734124183655, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0082, + "step": 3680 + }, + { + "epoch": 0.22584001468878145, + "grad_norm": 0.2861844599246979, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0079, + "step": 3690 + }, + { + "epoch": 0.22645204724891363, + "grad_norm": 0.3194407820701599, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0071, + "step": 3700 + }, + { + "epoch": 0.22706407980904583, + "grad_norm": 0.38770729303359985, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0076, + "step": 3710 + }, + { + "epoch": 0.22767611236917804, + "grad_norm": 0.4637960195541382, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0093, + "step": 3720 + }, + { + "epoch": 0.22828814492931024, + "grad_norm": 0.31972312927246094, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0078, + "step": 3730 + }, + { + "epoch": 0.22890017748944244, + "grad_norm": 0.5273001790046692, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0076, + "step": 3740 + }, + { + "epoch": 0.22951221004957464, + "grad_norm": 0.30589622259140015, + "learning_rate": 1.960385541132679e-05, + "loss": 0.009, + "step": 3750 + }, + { + "epoch": 0.23012424260970685, + "grad_norm": 0.31634265184402466, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0063, + "step": 3760 + }, + { + "epoch": 0.23073627516983902, + "grad_norm": 0.32762402296066284, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0081, + "step": 3770 + }, + { + "epoch": 0.23134830772997123, + "grad_norm": 0.42696496844291687, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 0.23196034029010343, + "grad_norm": 0.4676671624183655, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0093, + "step": 3790 + }, + { + "epoch": 0.23257237285023563, + "grad_norm": 0.3347911536693573, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 0.23318440541036783, + "grad_norm": 0.3083193600177765, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0072, + "step": 3810 + }, + { + "epoch": 0.23379643797050004, + "grad_norm": 0.38178423047065735, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0083, + "step": 3820 + }, + { + "epoch": 0.23440847053063224, + "grad_norm": 0.2796846330165863, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0072, + "step": 3830 + }, + { + "epoch": 0.23502050309076442, + "grad_norm": 0.37444883584976196, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.23563253565089662, + "grad_norm": 0.3286772668361664, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23624456821102882, + "grad_norm": 0.45423513650894165, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.23685660077116102, + "grad_norm": 0.36881721019744873, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0068, + "step": 3870 + }, + { + "epoch": 0.23746863333129323, + "grad_norm": 0.3560579717159271, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0084, + "step": 3880 + }, + { + "epoch": 0.23808066589142543, + "grad_norm": 0.43887296319007874, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0076, + "step": 3890 + }, + { + "epoch": 0.23869269845155763, + "grad_norm": 0.3080165982246399, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0069, + "step": 3900 + }, + { + "epoch": 0.2393047310116898, + "grad_norm": 0.2327195703983307, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.239916763571822, + "grad_norm": 0.5960802435874939, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0088, + "step": 3920 + }, + { + "epoch": 0.24052879613195421, + "grad_norm": 0.36213600635528564, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0076, + "step": 3930 + }, + { + "epoch": 0.24114082869208642, + "grad_norm": 0.2950032949447632, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0115, + "step": 3940 + }, + { + "epoch": 0.24175286125221862, + "grad_norm": 0.4527084529399872, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0089, + "step": 3950 + }, + { + "epoch": 0.24236489381235082, + "grad_norm": 0.4422491192817688, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0135, + "step": 3960 + }, + { + "epoch": 0.24297692637248303, + "grad_norm": 0.45049232244491577, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 0.2435889589326152, + "grad_norm": 0.2566494941711426, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0095, + "step": 3980 + }, + { + "epoch": 0.2442009914927474, + "grad_norm": 0.49880343675613403, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0099, + "step": 3990 + }, + { + "epoch": 0.2448130240528796, + "grad_norm": 0.4699341952800751, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0084, + "step": 4000 + }, + { + "epoch": 0.2454250566130118, + "grad_norm": 0.41230708360671997, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 0.246037089173144, + "grad_norm": 0.4836854934692383, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.01, + "step": 4020 + }, + { + "epoch": 0.24664912173327622, + "grad_norm": 0.3056115508079529, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0082, + "step": 4030 + }, + { + "epoch": 0.24726115429340842, + "grad_norm": 0.151325523853302, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0062, + "step": 4040 + }, + { + "epoch": 0.2478731868535406, + "grad_norm": 0.3798811137676239, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0081, + "step": 4050 + }, + { + "epoch": 0.2484852194136728, + "grad_norm": 0.3308229148387909, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 0.249097251973805, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0074, + "step": 4070 + }, + { + "epoch": 0.2497092845339372, + "grad_norm": 0.24179549515247345, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 0.2503213170940694, + "grad_norm": 0.20879383385181427, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 0.2509333496542016, + "grad_norm": 0.39275774359703064, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.2515453822143338, + "grad_norm": 0.2925782799720764, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0095, + "step": 4110 + }, + { + "epoch": 0.252157414774466, + "grad_norm": 0.6465128660202026, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0102, + "step": 4120 + }, + { + "epoch": 0.2527694473345982, + "grad_norm": 0.34663915634155273, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.007, + "step": 4130 + }, + { + "epoch": 0.2533814798947304, + "grad_norm": 0.3387165367603302, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0091, + "step": 4140 + }, + { + "epoch": 0.2539935124548626, + "grad_norm": 0.32989630103111267, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0084, + "step": 4150 + }, + { + "epoch": 0.25460554501499477, + "grad_norm": 0.22870391607284546, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0071, + "step": 4160 + }, + { + "epoch": 0.255217577575127, + "grad_norm": 0.3866496682167053, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0079, + "step": 4170 + }, + { + "epoch": 0.2558296101352592, + "grad_norm": 0.29885268211364746, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0068, + "step": 4180 + }, + { + "epoch": 0.2564416426953914, + "grad_norm": 0.4693736135959625, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 0.2570536752555236, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0074, + "step": 4200 + }, + { + "epoch": 0.2576657078156558, + "grad_norm": 0.21141012012958527, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 0.258277740375788, + "grad_norm": 0.2284570336341858, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0102, + "step": 4220 + }, + { + "epoch": 0.2588897729359202, + "grad_norm": 0.4675048887729645, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0112, + "step": 4230 + }, + { + "epoch": 0.2595018054960524, + "grad_norm": 0.3906441628932953, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0088, + "step": 4240 + }, + { + "epoch": 0.2601138380561846, + "grad_norm": 0.22990387678146362, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 0.2607258706163168, + "grad_norm": 0.41871073842048645, + "learning_rate": 1.944490251296856e-05, + "loss": 0.009, + "step": 4260 + }, + { + "epoch": 0.261337903176449, + "grad_norm": 0.2724440395832062, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0074, + "step": 4270 + }, + { + "epoch": 0.2619499357365812, + "grad_norm": 0.42590636014938354, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0069, + "step": 4280 + }, + { + "epoch": 0.2625619682967134, + "grad_norm": 0.3604855239391327, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0068, + "step": 4290 + }, + { + "epoch": 0.26317400085684556, + "grad_norm": 0.475304514169693, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0082, + "step": 4300 + }, + { + "epoch": 0.26378603341697776, + "grad_norm": 0.24752479791641235, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0065, + "step": 4310 + }, + { + "epoch": 0.26439806597710996, + "grad_norm": 0.4384835958480835, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0104, + "step": 4320 + }, + { + "epoch": 0.26501009853724217, + "grad_norm": 0.24999107420444489, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0076, + "step": 4330 + }, + { + "epoch": 0.26562213109737437, + "grad_norm": 0.292491614818573, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 0.2662341636575066, + "grad_norm": 0.2380208522081375, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0069, + "step": 4350 + }, + { + "epoch": 0.2668461962176388, + "grad_norm": 0.2906023859977722, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0063, + "step": 4360 + }, + { + "epoch": 0.267458228777771, + "grad_norm": 0.4718990623950958, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0074, + "step": 4370 + }, + { + "epoch": 0.2680702613379032, + "grad_norm": 0.33257269859313965, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0073, + "step": 4380 + }, + { + "epoch": 0.2686822938980354, + "grad_norm": 0.34411463141441345, + "learning_rate": 1.940024231916886e-05, + "loss": 0.006, + "step": 4390 + }, + { + "epoch": 0.2692943264581676, + "grad_norm": 0.40312516689300537, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0093, + "step": 4400 + }, + { + "epoch": 0.2699063590182998, + "grad_norm": 0.2248350828886032, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0082, + "step": 4410 + }, + { + "epoch": 0.270518391578432, + "grad_norm": 0.30094820261001587, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0092, + "step": 4420 + }, + { + "epoch": 0.2711304241385642, + "grad_norm": 0.4277440309524536, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0099, + "step": 4430 + }, + { + "epoch": 0.27174245669869634, + "grad_norm": 0.2876254916191101, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0078, + "step": 4440 + }, + { + "epoch": 0.27235448925882855, + "grad_norm": 0.3453986346721649, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0096, + "step": 4450 + }, + { + "epoch": 0.27296652181896075, + "grad_norm": 0.31379634141921997, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0088, + "step": 4460 + }, + { + "epoch": 0.27357855437909295, + "grad_norm": 0.294477254152298, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0073, + "step": 4470 + }, + { + "epoch": 0.27419058693922516, + "grad_norm": 0.3773270845413208, + "learning_rate": 1.936834723687526e-05, + "loss": 0.008, + "step": 4480 + }, + { + "epoch": 0.27480261949935736, + "grad_norm": 0.31942978501319885, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0086, + "step": 4490 + }, + { + "epoch": 0.27541465205948956, + "grad_norm": 0.46827632188796997, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27602668461962176, + "grad_norm": 0.2735249102115631, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0107, + "step": 4510 + }, + { + "epoch": 0.27663871717975397, + "grad_norm": 0.30048197507858276, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0082, + "step": 4520 + }, + { + "epoch": 0.27725074973988617, + "grad_norm": 0.3507469594478607, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0093, + "step": 4530 + }, + { + "epoch": 0.2778627823000184, + "grad_norm": 0.5642989277839661, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0091, + "step": 4540 + }, + { + "epoch": 0.2784748148601506, + "grad_norm": 0.2769993245601654, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0105, + "step": 4550 + }, + { + "epoch": 0.2790868474202828, + "grad_norm": 0.30269622802734375, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0066, + "step": 4560 + }, + { + "epoch": 0.279698879980415, + "grad_norm": 0.3717023432254791, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0114, + "step": 4570 + }, + { + "epoch": 0.28031091254054713, + "grad_norm": 0.5065163373947144, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0075, + "step": 4580 + }, + { + "epoch": 0.28092294510067933, + "grad_norm": 0.4302189350128174, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0077, + "step": 4590 + }, + { + "epoch": 0.28153497766081154, + "grad_norm": 0.44008374214172363, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0086, + "step": 4600 + }, + { + "epoch": 0.28214701022094374, + "grad_norm": 0.4647364318370819, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0073, + "step": 4610 + }, + { + "epoch": 0.28275904278107594, + "grad_norm": 0.4229913651943207, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0077, + "step": 4620 + }, + { + "epoch": 0.28337107534120815, + "grad_norm": 0.36600178480148315, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.28398310790134035, + "grad_norm": 0.47143280506134033, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0085, + "step": 4640 + }, + { + "epoch": 0.28459514046147255, + "grad_norm": 0.29140496253967285, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0056, + "step": 4650 + }, + { + "epoch": 0.28520717302160475, + "grad_norm": 0.3964666426181793, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0071, + "step": 4660 + }, + { + "epoch": 0.28581920558173696, + "grad_norm": 0.407536119222641, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0095, + "step": 4670 + }, + { + "epoch": 0.28643123814186916, + "grad_norm": 0.33687031269073486, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0091, + "step": 4680 + }, + { + "epoch": 0.28704327070200136, + "grad_norm": 0.3182448446750641, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0087, + "step": 4690 + }, + { + "epoch": 0.28765530326213357, + "grad_norm": 0.40998023748397827, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0099, + "step": 4700 + }, + { + "epoch": 0.28826733582226577, + "grad_norm": 0.28750360012054443, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0087, + "step": 4710 + }, + { + "epoch": 0.2888793683823979, + "grad_norm": 0.36494627594947815, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 0.2894914009425301, + "grad_norm": 0.37047910690307617, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0094, + "step": 4730 + }, + { + "epoch": 0.2901034335026623, + "grad_norm": 0.2577553987503052, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.2907154660627945, + "grad_norm": 0.24589397013187408, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0067, + "step": 4750 + }, + { + "epoch": 0.29132749862292673, + "grad_norm": 0.37927499413490295, + "learning_rate": 1.926404507646751e-05, + "loss": 0.008, + "step": 4760 + }, + { + "epoch": 0.29193953118305893, + "grad_norm": 0.40547946095466614, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0101, + "step": 4770 + }, + { + "epoch": 0.29255156374319113, + "grad_norm": 0.47896578907966614, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0114, + "step": 4780 + }, + { + "epoch": 0.29316359630332334, + "grad_norm": 0.42911696434020996, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0066, + "step": 4790 + }, + { + "epoch": 0.29377562886345554, + "grad_norm": 0.21735505759716034, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0072, + "step": 4800 + }, + { + "epoch": 0.29438766142358774, + "grad_norm": 0.25916650891304016, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.29499969398371995, + "grad_norm": 0.23863966763019562, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0086, + "step": 4820 + }, + { + "epoch": 0.29561172654385215, + "grad_norm": 0.41552650928497314, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0078, + "step": 4830 + }, + { + "epoch": 0.29622375910398435, + "grad_norm": 0.2775874733924866, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0082, + "step": 4840 + }, + { + "epoch": 0.29683579166411656, + "grad_norm": 0.28962916135787964, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0088, + "step": 4850 + }, + { + "epoch": 0.2974478242242487, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0076, + "step": 4860 + }, + { + "epoch": 0.2980598567843809, + "grad_norm": 0.3833489716053009, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0096, + "step": 4870 + }, + { + "epoch": 0.2986718893445131, + "grad_norm": 0.20357537269592285, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.2992839219046453, + "grad_norm": 0.4648539423942566, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0099, + "step": 4890 + }, + { + "epoch": 0.2998959544647775, + "grad_norm": 0.2701941728591919, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0062, + "step": 4900 + }, + { + "epoch": 0.3005079870249097, + "grad_norm": 0.31277161836624146, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0069, + "step": 4910 + }, + { + "epoch": 0.3011200195850419, + "grad_norm": 0.27697697281837463, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0094, + "step": 4920 + }, + { + "epoch": 0.3017320521451741, + "grad_norm": 0.22880606353282928, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0074, + "step": 4930 + }, + { + "epoch": 0.3023440847053063, + "grad_norm": 0.258404940366745, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0078, + "step": 4940 + }, + { + "epoch": 0.30295611726543853, + "grad_norm": 0.394394189119339, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0109, + "step": 4950 + }, + { + "epoch": 0.30356814982557073, + "grad_norm": 0.24108687043190002, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0082, + "step": 4960 + }, + { + "epoch": 0.30418018238570294, + "grad_norm": 0.34520867466926575, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0098, + "step": 4970 + }, + { + "epoch": 0.30479221494583514, + "grad_norm": 0.33723267912864685, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0104, + "step": 4980 + }, + { + "epoch": 0.30540424750596734, + "grad_norm": 0.28276878595352173, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0072, + "step": 4990 + }, + { + "epoch": 0.30601628006609954, + "grad_norm": 0.32236188650131226, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.012, + "step": 5000 + }, + { + "epoch": 0.3066283126262317, + "grad_norm": 0.20596888661384583, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0073, + "step": 5010 + }, + { + "epoch": 0.3072403451863639, + "grad_norm": 0.37921255826950073, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0073, + "step": 5020 + }, + { + "epoch": 0.3078523777464961, + "grad_norm": 0.30738911032676697, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0083, + "step": 5030 + }, + { + "epoch": 0.3084644103066283, + "grad_norm": 0.1938163936138153, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.3090764428667605, + "grad_norm": 0.25826898217201233, + "learning_rate": 1.914800406458133e-05, + "loss": 0.008, + "step": 5050 + }, + { + "epoch": 0.3096884754268927, + "grad_norm": 0.18951697647571564, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0058, + "step": 5060 + }, + { + "epoch": 0.3103005079870249, + "grad_norm": 0.3877381980419159, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3109125405471571, + "grad_norm": 0.3133573830127716, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0088, + "step": 5080 + }, + { + "epoch": 0.3115245731072893, + "grad_norm": 0.33131852746009827, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0062, + "step": 5090 + }, + { + "epoch": 0.3121366056674215, + "grad_norm": 0.21276263892650604, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0071, + "step": 5100 + }, + { + "epoch": 0.3127486382275537, + "grad_norm": 0.46878281235694885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0084, + "step": 5110 + }, + { + "epoch": 0.3133606707876859, + "grad_norm": 0.44227683544158936, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0097, + "step": 5120 + }, + { + "epoch": 0.3139727033478181, + "grad_norm": 0.41950204968452454, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.31458473590795033, + "grad_norm": 0.4214445948600769, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0081, + "step": 5140 + }, + { + "epoch": 0.3151967684680825, + "grad_norm": 0.3779868483543396, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0075, + "step": 5150 + }, + { + "epoch": 0.3158088010282147, + "grad_norm": 0.4587777853012085, + "learning_rate": 1.910187855634501e-05, + "loss": 0.009, + "step": 5160 + }, + { + "epoch": 0.3164208335883469, + "grad_norm": 0.4875587224960327, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0099, + "step": 5170 + }, + { + "epoch": 0.3170328661484791, + "grad_norm": 0.22378237545490265, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0071, + "step": 5180 + }, + { + "epoch": 0.3176448987086113, + "grad_norm": 0.3360678553581238, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0101, + "step": 5190 + }, + { + "epoch": 0.3182569312687435, + "grad_norm": 0.36370640993118286, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0068, + "step": 5200 + }, + { + "epoch": 0.3188689638288757, + "grad_norm": 0.25814393162727356, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0076, + "step": 5210 + }, + { + "epoch": 0.3194809963890079, + "grad_norm": 0.39010074734687805, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0066, + "step": 5220 + }, + { + "epoch": 0.3200930289491401, + "grad_norm": 0.44009074568748474, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0078, + "step": 5230 + }, + { + "epoch": 0.3207050615092723, + "grad_norm": 0.45733046531677246, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0074, + "step": 5240 + }, + { + "epoch": 0.3213170940694045, + "grad_norm": 0.4555135667324066, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0089, + "step": 5250 + }, + { + "epoch": 0.3219291266295367, + "grad_norm": 0.5864276885986328, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0083, + "step": 5260 + }, + { + "epoch": 0.3225411591896689, + "grad_norm": 0.3305470943450928, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0094, + "step": 5270 + }, + { + "epoch": 0.3231531917498011, + "grad_norm": 0.21458053588867188, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0091, + "step": 5280 + }, + { + "epoch": 0.32376522430993326, + "grad_norm": 0.2927384376525879, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.007, + "step": 5290 + }, + { + "epoch": 0.32437725687006547, + "grad_norm": 0.387608140707016, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0068, + "step": 5300 + }, + { + "epoch": 0.32498928943019767, + "grad_norm": 0.28193122148513794, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0065, + "step": 5310 + }, + { + "epoch": 0.3256013219903299, + "grad_norm": 0.33098119497299194, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0082, + "step": 5320 + }, + { + "epoch": 0.3262133545504621, + "grad_norm": 0.5442482233047485, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0124, + "step": 5330 + }, + { + "epoch": 0.3268253871105943, + "grad_norm": 0.503669798374176, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0093, + "step": 5340 + }, + { + "epoch": 0.3274374196707265, + "grad_norm": 0.2307574301958084, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0071, + "step": 5350 + }, + { + "epoch": 0.3280494522308587, + "grad_norm": 0.3543917238712311, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.007, + "step": 5360 + }, + { + "epoch": 0.3286614847909909, + "grad_norm": 0.21763169765472412, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0059, + "step": 5370 + }, + { + "epoch": 0.3292735173511231, + "grad_norm": 0.38023391366004944, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 0.3298855499112553, + "grad_norm": 0.44597327709198, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0069, + "step": 5390 + }, + { + "epoch": 0.3304975824713875, + "grad_norm": 0.2994389533996582, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0083, + "step": 5400 + }, + { + "epoch": 0.3311096150315197, + "grad_norm": 0.26668304204940796, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0071, + "step": 5410 + }, + { + "epoch": 0.3317216475916519, + "grad_norm": 0.25944197177886963, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0065, + "step": 5420 + }, + { + "epoch": 0.33233368015178405, + "grad_norm": 0.3646431267261505, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0065, + "step": 5430 + }, + { + "epoch": 0.33294571271191625, + "grad_norm": 0.34860959649086, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0098, + "step": 5440 + }, + { + "epoch": 0.33355774527204846, + "grad_norm": 0.33718568086624146, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0069, + "step": 5450 + }, + { + "epoch": 0.33416977783218066, + "grad_norm": 0.2417302280664444, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0064, + "step": 5460 + }, + { + "epoch": 0.33478181039231286, + "grad_norm": 0.26607826352119446, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0048, + "step": 5470 + }, + { + "epoch": 0.33539384295244506, + "grad_norm": 0.31762364506721497, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0064, + "step": 5480 + }, + { + "epoch": 0.33600587551257727, + "grad_norm": 0.21427015960216522, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0079, + "step": 5490 + }, + { + "epoch": 0.33661790807270947, + "grad_norm": 0.3372637629508972, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.3372299406328417, + "grad_norm": 0.3760700821876526, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0066, + "step": 5510 + }, + { + "epoch": 0.3378419731929739, + "grad_norm": 0.22838029265403748, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0061, + "step": 5520 + }, + { + "epoch": 0.3384540057531061, + "grad_norm": 0.3105243444442749, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0089, + "step": 5530 + }, + { + "epoch": 0.3390660383132383, + "grad_norm": 0.23694929480552673, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0086, + "step": 5540 + }, + { + "epoch": 0.3396780708733705, + "grad_norm": 0.22935174405574799, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.3402901034335027, + "grad_norm": 0.26384714245796204, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 0.34090213599363484, + "grad_norm": 0.33245643973350525, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0061, + "step": 5570 + }, + { + "epoch": 0.34151416855376704, + "grad_norm": 0.3904813230037689, + "learning_rate": 1.891523933768891e-05, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.34212620111389924, + "grad_norm": 0.33858415484428406, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0067, + "step": 5590 + }, + { + "epoch": 0.34273823367403145, + "grad_norm": 0.3197486996650696, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0082, + "step": 5600 + }, + { + "epoch": 0.34335026623416365, + "grad_norm": 0.23814789950847626, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0068, + "step": 5610 + }, + { + "epoch": 0.34396229879429585, + "grad_norm": 0.3820457458496094, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0082, + "step": 5620 + }, + { + "epoch": 0.34457433135442805, + "grad_norm": 0.27518680691719055, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0063, + "step": 5630 + }, + { + "epoch": 0.34518636391456026, + "grad_norm": 0.24741721153259277, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 0.34579839647469246, + "grad_norm": 0.5140052437782288, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0178, + "step": 5650 + }, + { + "epoch": 0.34641042903482466, + "grad_norm": 0.5363543033599854, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0097, + "step": 5660 + }, + { + "epoch": 0.34702246159495687, + "grad_norm": 0.41116055846214294, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0078, + "step": 5670 + }, + { + "epoch": 0.34763449415508907, + "grad_norm": 0.412762314081192, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0086, + "step": 5680 + }, + { + "epoch": 0.34824652671522127, + "grad_norm": 0.399527907371521, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0088, + "step": 5690 + }, + { + "epoch": 0.3488585592753535, + "grad_norm": 0.3447834551334381, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0084, + "step": 5700 + }, + { + "epoch": 0.3494705918354856, + "grad_norm": 0.3418859541416168, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0092, + "step": 5710 + }, + { + "epoch": 0.3500826243956178, + "grad_norm": 0.3336535692214966, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0065, + "step": 5720 + }, + { + "epoch": 0.35069465695575003, + "grad_norm": 0.34575122594833374, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0068, + "step": 5730 + }, + { + "epoch": 0.35130668951588223, + "grad_norm": 0.34325110912323, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.01, + "step": 5740 + }, + { + "epoch": 0.35191872207601443, + "grad_norm": 0.20104236900806427, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0085, + "step": 5750 + }, + { + "epoch": 0.35253075463614664, + "grad_norm": 0.33699074387550354, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0073, + "step": 5760 + }, + { + "epoch": 0.35314278719627884, + "grad_norm": 0.33322635293006897, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.35375481975641104, + "grad_norm": 0.26897475123405457, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0087, + "step": 5780 + }, + { + "epoch": 0.35436685231654325, + "grad_norm": 0.5310013890266418, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0067, + "step": 5790 + }, + { + "epoch": 0.35497888487667545, + "grad_norm": 0.4203440845012665, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0097, + "step": 5800 + }, + { + "epoch": 0.35559091743680765, + "grad_norm": 0.2179369181394577, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0082, + "step": 5810 + }, + { + "epoch": 0.35620294999693985, + "grad_norm": 0.2789444625377655, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0066, + "step": 5820 + }, + { + "epoch": 0.35681498255707206, + "grad_norm": 0.28009694814682007, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.007, + "step": 5830 + }, + { + "epoch": 0.35742701511720426, + "grad_norm": 0.304768443107605, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0065, + "step": 5840 + }, + { + "epoch": 0.3580390476773364, + "grad_norm": 0.2829401195049286, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0061, + "step": 5850 + }, + { + "epoch": 0.3586510802374686, + "grad_norm": 0.3388998508453369, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0083, + "step": 5860 + }, + { + "epoch": 0.3592631127976008, + "grad_norm": 0.3313426673412323, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 0.359875145357733, + "grad_norm": 0.2886904180049896, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0094, + "step": 5880 + }, + { + "epoch": 0.3604871779178652, + "grad_norm": 0.3132432997226715, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0086, + "step": 5890 + }, + { + "epoch": 0.3610992104779974, + "grad_norm": 0.37195107340812683, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0137, + "step": 5900 + }, + { + "epoch": 0.3617112430381296, + "grad_norm": 0.30853375792503357, + "learning_rate": 1.875708056549365e-05, + "loss": 0.01, + "step": 5910 + }, + { + "epoch": 0.36232327559826183, + "grad_norm": 0.39785459637641907, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0073, + "step": 5920 + }, + { + "epoch": 0.36293530815839403, + "grad_norm": 0.26958727836608887, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0059, + "step": 5930 + }, + { + "epoch": 0.36354734071852624, + "grad_norm": 0.354956716299057, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0069, + "step": 5940 + }, + { + "epoch": 0.36415937327865844, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0066, + "step": 5950 + }, + { + "epoch": 0.36477140583879064, + "grad_norm": 0.30000701546669006, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0075, + "step": 5960 + }, + { + "epoch": 0.36538343839892284, + "grad_norm": 0.5558263063430786, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0083, + "step": 5970 + }, + { + "epoch": 0.36599547095905505, + "grad_norm": 0.39146295189857483, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0062, + "step": 5980 + }, + { + "epoch": 0.3666075035191872, + "grad_norm": 0.44002753496170044, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0089, + "step": 5990 + }, + { + "epoch": 0.3672195360793194, + "grad_norm": 0.3220095932483673, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0097, + "step": 6000 + }, + { + "epoch": 0.3678315686394516, + "grad_norm": 0.3569507598876953, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0067, + "step": 6010 + }, + { + "epoch": 0.3684436011995838, + "grad_norm": 0.3004184365272522, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0084, + "step": 6020 + }, + { + "epoch": 0.369055633759716, + "grad_norm": 0.2931320071220398, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0079, + "step": 6030 + }, + { + "epoch": 0.3696676663198482, + "grad_norm": 0.39551016688346863, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0088, + "step": 6040 + }, + { + "epoch": 0.3702796988799804, + "grad_norm": 0.33755603432655334, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0082, + "step": 6050 + }, + { + "epoch": 0.3708917314401126, + "grad_norm": 0.3101558983325958, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0069, + "step": 6060 + }, + { + "epoch": 0.3715037640002448, + "grad_norm": 0.2921602129936218, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0065, + "step": 6070 + }, + { + "epoch": 0.372115796560377, + "grad_norm": 0.3601403832435608, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0063, + "step": 6080 + }, + { + "epoch": 0.3727278291205092, + "grad_norm": 0.34929168224334717, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0073, + "step": 6090 + }, + { + "epoch": 0.3733398616806414, + "grad_norm": 0.3987390995025635, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0068, + "step": 6100 + }, + { + "epoch": 0.37395189424077363, + "grad_norm": 0.2641090452671051, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0091, + "step": 6110 + }, + { + "epoch": 0.37456392680090583, + "grad_norm": 0.23139338195323944, + "learning_rate": 1.865125972978549e-05, + "loss": 0.006, + "step": 6120 + }, + { + "epoch": 0.375175959361038, + "grad_norm": 0.26552167534828186, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.3757879919211702, + "grad_norm": 0.43827885389328003, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0084, + "step": 6140 + }, + { + "epoch": 0.3764000244813024, + "grad_norm": 0.27495354413986206, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 0.3770120570414346, + "grad_norm": 0.36078640818595886, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0104, + "step": 6160 + }, + { + "epoch": 0.3776240896015668, + "grad_norm": 0.28252753615379333, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0103, + "step": 6170 + }, + { + "epoch": 0.378236122161699, + "grad_norm": 0.2674558162689209, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0069, + "step": 6180 + }, + { + "epoch": 0.3788481547218312, + "grad_norm": 0.21457509696483612, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.3794601872819634, + "grad_norm": 0.3142339885234833, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0075, + "step": 6200 + }, + { + "epoch": 0.3800722198420956, + "grad_norm": 0.32714203000068665, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0084, + "step": 6210 + }, + { + "epoch": 0.3806842524022278, + "grad_norm": 0.2632557153701782, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0072, + "step": 6220 + }, + { + "epoch": 0.38129628496236, + "grad_norm": 0.1893932968378067, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0063, + "step": 6230 + }, + { + "epoch": 0.3819083175224922, + "grad_norm": 0.49935290217399597, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0087, + "step": 6240 + }, + { + "epoch": 0.3825203500826244, + "grad_norm": 0.34605127573013306, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0101, + "step": 6250 + }, + { + "epoch": 0.3831323826427566, + "grad_norm": 0.3294198513031006, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0067, + "step": 6260 + }, + { + "epoch": 0.38374441520288877, + "grad_norm": 0.34797370433807373, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0065, + "step": 6270 + }, + { + "epoch": 0.38435644776302097, + "grad_norm": 0.37710750102996826, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0061, + "step": 6280 + }, + { + "epoch": 0.3849684803231532, + "grad_norm": 0.39949893951416016, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0066, + "step": 6290 + }, + { + "epoch": 0.3855805128832854, + "grad_norm": 0.33014294505119324, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0095, + "step": 6300 + }, + { + "epoch": 0.3861925454434176, + "grad_norm": 0.4329249858856201, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0089, + "step": 6310 + }, + { + "epoch": 0.3868045780035498, + "grad_norm": 0.298330157995224, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0076, + "step": 6320 + }, + { + "epoch": 0.387416610563682, + "grad_norm": 0.2672661542892456, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 0.3880286431238142, + "grad_norm": 0.48193076252937317, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0103, + "step": 6340 + }, + { + "epoch": 0.3886406756839464, + "grad_norm": 0.29180601239204407, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0075, + "step": 6350 + }, + { + "epoch": 0.3892527082440786, + "grad_norm": 0.21320492029190063, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0077, + "step": 6360 + }, + { + "epoch": 0.3898647408042108, + "grad_norm": 0.37252935767173767, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0078, + "step": 6370 + }, + { + "epoch": 0.390476773364343, + "grad_norm": 0.284586101770401, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0089, + "step": 6380 + }, + { + "epoch": 0.3910888059244752, + "grad_norm": 0.5030382871627808, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0088, + "step": 6390 + }, + { + "epoch": 0.3917008384846074, + "grad_norm": 0.357239305973053, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0069, + "step": 6400 + }, + { + "epoch": 0.39231287104473955, + "grad_norm": 0.20308594405651093, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0062, + "step": 6410 + }, + { + "epoch": 0.39292490360487176, + "grad_norm": 0.2678150534629822, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.39353693616500396, + "grad_norm": 0.35160595178604126, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0072, + "step": 6430 + }, + { + "epoch": 0.39414896872513616, + "grad_norm": 0.33254173398017883, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 0.39476100128526836, + "grad_norm": 0.22763408720493317, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0061, + "step": 6450 + }, + { + "epoch": 0.39537303384540057, + "grad_norm": 0.20889192819595337, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 0.39598506640553277, + "grad_norm": 0.22515206038951874, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0086, + "step": 6470 + }, + { + "epoch": 0.396597098965665, + "grad_norm": 0.36421817541122437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0064, + "step": 6480 + }, + { + "epoch": 0.3972091315257972, + "grad_norm": 0.3869773745536804, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0084, + "step": 6490 + }, + { + "epoch": 0.3978211640859294, + "grad_norm": 0.26248687505722046, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0061, + "step": 6500 + }, + { + "epoch": 0.3984331966460616, + "grad_norm": 0.22152310609817505, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0057, + "step": 6510 + }, + { + "epoch": 0.3990452292061938, + "grad_norm": 0.25921961665153503, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0071, + "step": 6520 + }, + { + "epoch": 0.399657261766326, + "grad_norm": 0.3289903998374939, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.4002692943264582, + "grad_norm": 0.2767571210861206, + "learning_rate": 1.8427795928237e-05, + "loss": 0.01, + "step": 6540 + }, + { + "epoch": 0.40088132688659034, + "grad_norm": 0.46339666843414307, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0064, + "step": 6550 + }, + { + "epoch": 0.40149335944672254, + "grad_norm": 0.2942553460597992, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0067, + "step": 6560 + }, + { + "epoch": 0.40210539200685474, + "grad_norm": 0.3868240714073181, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0085, + "step": 6570 + }, + { + "epoch": 0.40271742456698695, + "grad_norm": 0.3999684154987335, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0067, + "step": 6580 + }, + { + "epoch": 0.40332945712711915, + "grad_norm": 0.42856812477111816, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0091, + "step": 6590 + }, + { + "epoch": 0.40394148968725135, + "grad_norm": 0.3099806010723114, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0084, + "step": 6600 + }, + { + "epoch": 0.40455352224738356, + "grad_norm": 0.3798827826976776, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0066, + "step": 6610 + }, + { + "epoch": 0.40516555480751576, + "grad_norm": 0.19007280468940735, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 0.40577758736764796, + "grad_norm": 0.3723277151584625, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0085, + "step": 6630 + }, + { + "epoch": 0.40638961992778017, + "grad_norm": 0.21034900844097137, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0069, + "step": 6640 + }, + { + "epoch": 0.40700165248791237, + "grad_norm": 0.29838645458221436, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0095, + "step": 6650 + }, + { + "epoch": 0.40761368504804457, + "grad_norm": 0.2645854353904724, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0061, + "step": 6660 + }, + { + "epoch": 0.4082257176081768, + "grad_norm": 0.21633592247962952, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.006, + "step": 6670 + }, + { + "epoch": 0.408837750168309, + "grad_norm": 0.25387731194496155, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.008, + "step": 6680 + }, + { + "epoch": 0.4094497827284412, + "grad_norm": 0.3752288520336151, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0092, + "step": 6690 + }, + { + "epoch": 0.41006181528857333, + "grad_norm": 0.33368971943855286, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0072, + "step": 6700 + }, + { + "epoch": 0.41067384784870553, + "grad_norm": 0.34388917684555054, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0074, + "step": 6710 + }, + { + "epoch": 0.41128588040883773, + "grad_norm": 0.2683192789554596, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.007, + "step": 6720 + }, + { + "epoch": 0.41189791296896994, + "grad_norm": 0.5121234059333801, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0074, + "step": 6730 + }, + { + "epoch": 0.41250994552910214, + "grad_norm": 0.333406925201416, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0117, + "step": 6740 + }, + { + "epoch": 0.41312197808923434, + "grad_norm": 0.26011794805526733, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0062, + "step": 6750 + }, + { + "epoch": 0.41373401064936655, + "grad_norm": 0.28925821185112, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0066, + "step": 6760 + }, + { + "epoch": 0.41434604320949875, + "grad_norm": 0.2202957570552826, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0058, + "step": 6770 + }, + { + "epoch": 0.41495807576963095, + "grad_norm": 0.2740793824195862, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0072, + "step": 6780 + }, + { + "epoch": 0.41557010832976315, + "grad_norm": 0.46569427847862244, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0069, + "step": 6790 + }, + { + "epoch": 0.41618214088989536, + "grad_norm": 0.3959881067276001, + "learning_rate": 1.828172598376902e-05, + "loss": 0.009, + "step": 6800 + }, + { + "epoch": 0.41679417345002756, + "grad_norm": 0.2465214729309082, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0068, + "step": 6810 + }, + { + "epoch": 0.41740620601015976, + "grad_norm": 0.3207756280899048, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0083, + "step": 6820 + }, + { + "epoch": 0.41801823857029197, + "grad_norm": 0.5600990653038025, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.4186302711304241, + "grad_norm": 0.32832831144332886, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0072, + "step": 6840 + }, + { + "epoch": 0.4192423036905563, + "grad_norm": 0.3397129774093628, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0083, + "step": 6850 + }, + { + "epoch": 0.4198543362506885, + "grad_norm": 0.3481312096118927, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.4204663688108207, + "grad_norm": 0.4542059898376465, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0104, + "step": 6870 + }, + { + "epoch": 0.4210784013709529, + "grad_norm": 0.2517620325088501, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0089, + "step": 6880 + }, + { + "epoch": 0.42169043393108513, + "grad_norm": 0.3671923875808716, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0068, + "step": 6890 + }, + { + "epoch": 0.42230246649121733, + "grad_norm": 0.41340726613998413, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0084, + "step": 6900 + }, + { + "epoch": 0.42291449905134954, + "grad_norm": 0.22815965116024017, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 0.42352653161148174, + "grad_norm": 0.35324010252952576, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0073, + "step": 6920 + }, + { + "epoch": 0.42413856417161394, + "grad_norm": 0.30134323239326477, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0071, + "step": 6930 + }, + { + "epoch": 0.42475059673174614, + "grad_norm": 0.4007415771484375, + "learning_rate": 1.82006727813775e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 0.42536262929187835, + "grad_norm": 0.3320179879665375, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0074, + "step": 6950 + }, + { + "epoch": 0.42597466185201055, + "grad_norm": 0.311971515417099, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0062, + "step": 6960 + }, + { + "epoch": 0.42658669441214275, + "grad_norm": 0.34347453713417053, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 0.4271987269722749, + "grad_norm": 0.25632336735725403, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.4278107595324071, + "grad_norm": 0.21711130440235138, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0068, + "step": 6990 + }, + { + "epoch": 0.4284227920925393, + "grad_norm": 0.3381270170211792, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0064, + "step": 7000 + }, + { + "epoch": 0.4290348246526715, + "grad_norm": 0.32262885570526123, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0091, + "step": 7010 + }, + { + "epoch": 0.4296468572128037, + "grad_norm": 0.65865558385849, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0104, + "step": 7020 + }, + { + "epoch": 0.4302588897729359, + "grad_norm": 0.3021128177642822, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.009, + "step": 7030 + }, + { + "epoch": 0.4308709223330681, + "grad_norm": 0.2859005331993103, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0065, + "step": 7040 + }, + { + "epoch": 0.4314829548932003, + "grad_norm": 0.3379405736923218, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0062, + "step": 7050 + }, + { + "epoch": 0.4320949874533325, + "grad_norm": 0.22009991109371185, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.007, + "step": 7060 + }, + { + "epoch": 0.4327070200134647, + "grad_norm": 0.24766206741333008, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0072, + "step": 7070 + }, + { + "epoch": 0.43331905257359693, + "grad_norm": 0.3557615280151367, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0096, + "step": 7080 + }, + { + "epoch": 0.43393108513372913, + "grad_norm": 0.5700691938400269, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0066, + "step": 7090 + }, + { + "epoch": 0.43454311769386134, + "grad_norm": 0.3194892704486847, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0065, + "step": 7100 + }, + { + "epoch": 0.43515515025399354, + "grad_norm": 0.2766750752925873, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0074, + "step": 7110 + }, + { + "epoch": 0.4357671828141257, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.809403050791396e-05, + "loss": 0.007, + "step": 7120 + }, + { + "epoch": 0.4363792153742579, + "grad_norm": 0.4468507170677185, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0066, + "step": 7130 + }, + { + "epoch": 0.4369912479343901, + "grad_norm": 0.3282400369644165, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0185, + "step": 7140 + }, + { + "epoch": 0.4376032804945223, + "grad_norm": 0.2625710964202881, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0088, + "step": 7150 + }, + { + "epoch": 0.4382153130546545, + "grad_norm": 0.47729599475860596, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.008, + "step": 7160 + }, + { + "epoch": 0.4388273456147867, + "grad_norm": 0.30350950360298157, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0091, + "step": 7170 + }, + { + "epoch": 0.4394393781749189, + "grad_norm": 0.3514627516269684, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0065, + "step": 7180 + }, + { + "epoch": 0.4400514107350511, + "grad_norm": 0.26150578260421753, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0087, + "step": 7190 + }, + { + "epoch": 0.4406634432951833, + "grad_norm": 0.374138206243515, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0073, + "step": 7200 + }, + { + "epoch": 0.4412754758553155, + "grad_norm": 0.2980635166168213, + "learning_rate": 1.803969531201634e-05, + "loss": 0.007, + "step": 7210 + }, + { + "epoch": 0.4418875084154477, + "grad_norm": 0.38190510869026184, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0077, + "step": 7220 + }, + { + "epoch": 0.4424995409755799, + "grad_norm": 0.28819066286087036, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0065, + "step": 7230 + }, + { + "epoch": 0.4431115735357121, + "grad_norm": 0.43382275104522705, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0077, + "step": 7240 + }, + { + "epoch": 0.4437236060958443, + "grad_norm": 0.31589648127555847, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0065, + "step": 7250 + }, + { + "epoch": 0.4443356386559765, + "grad_norm": 0.3744536340236664, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0067, + "step": 7260 + }, + { + "epoch": 0.4449476712161087, + "grad_norm": 0.2600225806236267, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.008, + "step": 7270 + }, + { + "epoch": 0.4455597037762409, + "grad_norm": 0.28064799308776855, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0061, + "step": 7280 + }, + { + "epoch": 0.4461717363363731, + "grad_norm": 0.2745135426521301, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0075, + "step": 7290 + }, + { + "epoch": 0.4467837688965053, + "grad_norm": 0.23609793186187744, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 0.4473958014566375, + "grad_norm": 0.35910022258758545, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0079, + "step": 7310 + }, + { + "epoch": 0.4480078340167697, + "grad_norm": 0.22230662405490875, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0065, + "step": 7320 + }, + { + "epoch": 0.4486198665769019, + "grad_norm": 0.3835199475288391, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.008, + "step": 7330 + }, + { + "epoch": 0.4492318991370341, + "grad_norm": 0.37863102555274963, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0083, + "step": 7340 + }, + { + "epoch": 0.4498439316971663, + "grad_norm": 0.25412216782569885, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0056, + "step": 7350 + }, + { + "epoch": 0.4504559642572985, + "grad_norm": 0.43248918652534485, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0061, + "step": 7360 + }, + { + "epoch": 0.4510679968174307, + "grad_norm": 0.2937811613082886, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0073, + "step": 7370 + }, + { + "epoch": 0.4516800293775629, + "grad_norm": 0.3018436133861542, + "learning_rate": 1.793524061803872e-05, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.4522920619376951, + "grad_norm": 0.32781726121902466, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0079, + "step": 7390 + }, + { + "epoch": 0.45290409449782726, + "grad_norm": 0.2843719720840454, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0067, + "step": 7400 + }, + { + "epoch": 0.45351612705795946, + "grad_norm": 0.27588292956352234, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0092, + "step": 7410 + }, + { + "epoch": 0.45412815961809166, + "grad_norm": 0.38858234882354736, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0067, + "step": 7420 + }, + { + "epoch": 0.45474019217822387, + "grad_norm": 0.4235166609287262, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 0.45535222473835607, + "grad_norm": 0.272210031747818, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0059, + "step": 7440 + }, + { + "epoch": 0.4559642572984883, + "grad_norm": 0.23851896822452545, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0071, + "step": 7450 + }, + { + "epoch": 0.4565762898586205, + "grad_norm": 0.37179476022720337, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0073, + "step": 7460 + }, + { + "epoch": 0.4571883224187527, + "grad_norm": 0.31902605295181274, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.009, + "step": 7470 + }, + { + "epoch": 0.4578003549788849, + "grad_norm": 0.47023633122444153, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0091, + "step": 7480 + }, + { + "epoch": 0.4584123875390171, + "grad_norm": 0.35726839303970337, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0079, + "step": 7490 + }, + { + "epoch": 0.4590244200991493, + "grad_norm": 0.27567291259765625, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0073, + "step": 7500 + }, + { + "epoch": 0.4596364526592815, + "grad_norm": 0.23053516447544098, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0065, + "step": 7510 + }, + { + "epoch": 0.4602484852194137, + "grad_norm": 0.2169056385755539, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0054, + "step": 7520 + }, + { + "epoch": 0.4608605177795459, + "grad_norm": 0.2912258207798004, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.46147255033967804, + "grad_norm": 0.2527846097946167, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.006, + "step": 7540 + }, + { + "epoch": 0.46208458289981025, + "grad_norm": 0.3878445029258728, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 0.46269661545994245, + "grad_norm": 0.3981980085372925, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0081, + "step": 7560 + }, + { + "epoch": 0.46330864802007465, + "grad_norm": 0.48834845423698425, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "epoch": 0.46392068058020686, + "grad_norm": 0.3045276701450348, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0085, + "step": 7580 + }, + { + "epoch": 0.46453271314033906, + "grad_norm": 0.23345299065113068, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0072, + "step": 7590 + }, + { + "epoch": 0.46514474570047126, + "grad_norm": 0.3632943034172058, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 0.46575677826060347, + "grad_norm": 0.19813670217990875, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0073, + "step": 7610 + }, + { + "epoch": 0.46636881082073567, + "grad_norm": 0.36094173789024353, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0085, + "step": 7620 + }, + { + "epoch": 0.46698084338086787, + "grad_norm": 0.30049464106559753, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0085, + "step": 7630 + }, + { + "epoch": 0.4675928759410001, + "grad_norm": 0.27693697810173035, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.4682049085011323, + "grad_norm": 0.3656866252422333, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0064, + "step": 7650 + }, + { + "epoch": 0.4688169410612645, + "grad_norm": 0.602168083190918, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 0.4694289736213967, + "grad_norm": 0.3553078770637512, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0062, + "step": 7670 + }, + { + "epoch": 0.47004100618152883, + "grad_norm": 0.326695054769516, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0062, + "step": 7680 + }, + { + "epoch": 0.47065303874166103, + "grad_norm": 0.2762170732021332, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0075, + "step": 7690 + }, + { + "epoch": 0.47126507130179324, + "grad_norm": 0.35057321190834045, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0063, + "step": 7700 + }, + { + "epoch": 0.47187710386192544, + "grad_norm": 0.3906462788581848, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0086, + "step": 7710 + }, + { + "epoch": 0.47248913642205764, + "grad_norm": 0.290752112865448, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0087, + "step": 7720 + }, + { + "epoch": 0.47310116898218985, + "grad_norm": 0.2242034673690796, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.47371320154232205, + "grad_norm": 0.3283435106277466, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0068, + "step": 7740 + }, + { + "epoch": 0.47432523410245425, + "grad_norm": 0.24059069156646729, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0056, + "step": 7750 + }, + { + "epoch": 0.47493726666258645, + "grad_norm": 0.2978667914867401, + "learning_rate": 1.769330275540774e-05, + "loss": 0.007, + "step": 7760 + }, + { + "epoch": 0.47554929922271866, + "grad_norm": 0.2605571150779724, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0084, + "step": 7770 + }, + { + "epoch": 0.47616133178285086, + "grad_norm": 0.4010445475578308, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0075, + "step": 7780 + }, + { + "epoch": 0.47677336434298306, + "grad_norm": 0.31932029128074646, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0076, + "step": 7790 + }, + { + "epoch": 0.47738539690311527, + "grad_norm": 0.3508684039115906, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0067, + "step": 7800 + }, + { + "epoch": 0.47799742946324747, + "grad_norm": 0.2835206091403961, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0064, + "step": 7810 + }, + { + "epoch": 0.4786094620233796, + "grad_norm": 0.2661663293838501, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0051, + "step": 7820 + }, + { + "epoch": 0.4792214945835118, + "grad_norm": 0.4146379828453064, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.479833527143644, + "grad_norm": 0.38621196150779724, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0076, + "step": 7840 + }, + { + "epoch": 0.4804455597037762, + "grad_norm": 0.19052188098430634, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.008, + "step": 7850 + }, + { + "epoch": 0.48105759226390843, + "grad_norm": 0.3699149489402771, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0071, + "step": 7860 + }, + { + "epoch": 0.48166962482404063, + "grad_norm": 0.3756427764892578, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0071, + "step": 7870 + }, + { + "epoch": 0.48228165738417283, + "grad_norm": 0.2987386882305145, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0069, + "step": 7880 + }, + { + "epoch": 0.48289368994430504, + "grad_norm": 0.24891899526119232, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.48350572250443724, + "grad_norm": 0.44080299139022827, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.011, + "step": 7900 + }, + { + "epoch": 0.48411775506456944, + "grad_norm": 0.20801177620887756, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0089, + "step": 7910 + }, + { + "epoch": 0.48472978762470165, + "grad_norm": 0.31475305557250977, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0091, + "step": 7920 + }, + { + "epoch": 0.48534182018483385, + "grad_norm": 0.29783639311790466, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0082, + "step": 7930 + }, + { + "epoch": 0.48595385274496605, + "grad_norm": 0.3330203890800476, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0071, + "step": 7940 + }, + { + "epoch": 0.48656588530509826, + "grad_norm": 0.3537667691707611, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0068, + "step": 7950 + }, + { + "epoch": 0.4871779178652304, + "grad_norm": 0.2810688316822052, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0059, + "step": 7960 + }, + { + "epoch": 0.4877899504253626, + "grad_norm": 0.3359779715538025, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.4884019829854948, + "grad_norm": 0.36015257239341736, + "learning_rate": 1.754802282200567e-05, + "loss": 0.008, + "step": 7980 + }, + { + "epoch": 0.489014015545627, + "grad_norm": 0.2647690176963806, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0065, + "step": 7990 + }, + { + "epoch": 0.4896260481057592, + "grad_norm": 0.23366811871528625, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0068, + "step": 8000 + }, + { + "epoch": 0.4902380806658914, + "grad_norm": 0.2904139757156372, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0054, + "step": 8010 + }, + { + "epoch": 0.4908501132260236, + "grad_norm": 0.30941230058670044, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0069, + "step": 8020 + }, + { + "epoch": 0.4914621457861558, + "grad_norm": 0.1959473341703415, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0063, + "step": 8030 + }, + { + "epoch": 0.492074178346288, + "grad_norm": 0.33349713683128357, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0059, + "step": 8040 + }, + { + "epoch": 0.49268621090642023, + "grad_norm": 0.39017921686172485, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0067, + "step": 8050 + }, + { + "epoch": 0.49329824346655243, + "grad_norm": 0.36401957273483276, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0061, + "step": 8060 + }, + { + "epoch": 0.49391027602668464, + "grad_norm": 0.22296921908855438, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0071, + "step": 8070 + }, + { + "epoch": 0.49452230858681684, + "grad_norm": 0.8712129592895508, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0104, + "step": 8080 + }, + { + "epoch": 0.49513434114694904, + "grad_norm": 0.39942649006843567, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0068, + "step": 8090 + }, + { + "epoch": 0.4957463737070812, + "grad_norm": 0.3821292817592621, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0076, + "step": 8100 + }, + { + "epoch": 0.4963584062672134, + "grad_norm": 0.35861077904701233, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0086, + "step": 8110 + }, + { + "epoch": 0.4969704388273456, + "grad_norm": 0.38629451394081116, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0068, + "step": 8120 + }, + { + "epoch": 0.4975824713874778, + "grad_norm": 3.412374973297119, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0168, + "step": 8130 + }, + { + "epoch": 0.49819450394761, + "grad_norm": 0.2893833816051483, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0065, + "step": 8140 + }, + { + "epoch": 0.4988065365077422, + "grad_norm": 0.37679117918014526, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0058, + "step": 8150 + }, + { + "epoch": 0.4994185690678744, + "grad_norm": 0.2745130658149719, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0082, + "step": 8160 + }, + { + "epoch": 0.5000306016280066, + "grad_norm": 0.30250442028045654, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0065, + "step": 8170 + }, + { + "epoch": 0.5006426341881388, + "grad_norm": 0.19602464139461517, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0056, + "step": 8180 + }, + { + "epoch": 0.501254666748271, + "grad_norm": 0.4736115634441376, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0062, + "step": 8190 + }, + { + "epoch": 0.5018666993084032, + "grad_norm": 0.25439244508743286, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0069, + "step": 8200 + }, + { + "epoch": 0.5024787318685354, + "grad_norm": 0.19290995597839355, + "learning_rate": 1.739216409306913e-05, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.5030907644286676, + "grad_norm": 0.24844267964363098, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0071, + "step": 8220 + }, + { + "epoch": 0.5037027969887998, + "grad_norm": 0.21179668605327606, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 0.504314829548932, + "grad_norm": 0.29139387607574463, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.5049268621090642, + "grad_norm": 0.2621973752975464, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0068, + "step": 8250 + }, + { + "epoch": 0.5055388946691964, + "grad_norm": 0.23394125699996948, + "learning_rate": 1.735775329110705e-05, + "loss": 0.006, + "step": 8260 + }, + { + "epoch": 0.5061509272293286, + "grad_norm": 0.28399863839149475, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0067, + "step": 8270 + }, + { + "epoch": 0.5067629597894608, + "grad_norm": 0.5048072934150696, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.008, + "step": 8280 + }, + { + "epoch": 0.507374992349593, + "grad_norm": 0.33848801255226135, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0054, + "step": 8290 + }, + { + "epoch": 0.5079870249097252, + "grad_norm": 0.28341951966285706, + "learning_rate": 1.733009030001197e-05, + "loss": 0.008, + "step": 8300 + }, + { + "epoch": 0.5085990574698575, + "grad_norm": 0.3223153054714203, + "learning_rate": 1.732315596014244e-05, + "loss": 0.007, + "step": 8310 + }, + { + "epoch": 0.5092110900299895, + "grad_norm": 0.23227599263191223, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.5098231225901217, + "grad_norm": 0.2847786247730255, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.510435155150254, + "grad_norm": 0.2026357650756836, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 0.5110471877103862, + "grad_norm": 0.3617453873157501, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 0.5116592202705184, + "grad_norm": 0.4439109265804291, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0067, + "step": 8360 + }, + { + "epoch": 0.5122712528306506, + "grad_norm": 0.26640209555625916, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0086, + "step": 8370 + }, + { + "epoch": 0.5128832853907828, + "grad_norm": 0.38045984506607056, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.513495317950915, + "grad_norm": 0.23035791516304016, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.006, + "step": 8390 + }, + { + "epoch": 0.5141073505110472, + "grad_norm": 0.40618664026260376, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0071, + "step": 8400 + }, + { + "epoch": 0.5147193830711794, + "grad_norm": 0.2593354880809784, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0064, + "step": 8410 + }, + { + "epoch": 0.5153314156313116, + "grad_norm": 0.27723655104637146, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0062, + "step": 8420 + }, + { + "epoch": 0.5159434481914438, + "grad_norm": 0.3793911039829254, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0059, + "step": 8430 + }, + { + "epoch": 0.516555480751576, + "grad_norm": 0.28634312748908997, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0063, + "step": 8440 + }, + { + "epoch": 0.5171675133117082, + "grad_norm": 0.39417290687561035, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0078, + "step": 8450 + }, + { + "epoch": 0.5177795458718404, + "grad_norm": 0.3043057322502136, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0064, + "step": 8460 + }, + { + "epoch": 0.5183915784319726, + "grad_norm": 0.36794111132621765, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0106, + "step": 8470 + }, + { + "epoch": 0.5190036109921048, + "grad_norm": 0.312161922454834, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0067, + "step": 8480 + }, + { + "epoch": 0.519615643552237, + "grad_norm": 0.39240267872810364, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0065, + "step": 8490 + }, + { + "epoch": 0.5202276761123692, + "grad_norm": 0.4500446915626526, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0073, + "step": 8500 + }, + { + "epoch": 0.5208397086725014, + "grad_norm": 0.22808927297592163, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0072, + "step": 8510 + }, + { + "epoch": 0.5214517412326336, + "grad_norm": 0.3262411057949066, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0065, + "step": 8520 + }, + { + "epoch": 0.5220637737927658, + "grad_norm": 0.472229927778244, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0068, + "step": 8530 + }, + { + "epoch": 0.522675806352898, + "grad_norm": 0.31563568115234375, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5232878389130302, + "grad_norm": 0.27949750423431396, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0064, + "step": 8550 + }, + { + "epoch": 0.5238998714731624, + "grad_norm": 0.30297499895095825, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0075, + "step": 8560 + }, + { + "epoch": 0.5245119040332946, + "grad_norm": 0.3946770429611206, + "learning_rate": 1.714028248198457e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 0.5251239365934268, + "grad_norm": 0.3405992090702057, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0082, + "step": 8580 + }, + { + "epoch": 0.525735969153559, + "grad_norm": 0.2963511347770691, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0066, + "step": 8590 + }, + { + "epoch": 0.5263480017136911, + "grad_norm": 0.1909177303314209, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.006, + "step": 8600 + }, + { + "epoch": 0.5269600342738233, + "grad_norm": 0.3378836512565613, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 0.5275720668339555, + "grad_norm": 0.30862805247306824, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0067, + "step": 8620 + }, + { + "epoch": 0.5281840993940877, + "grad_norm": 0.397293359041214, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0071, + "step": 8630 + }, + { + "epoch": 0.5287961319542199, + "grad_norm": 0.3665411174297333, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0068, + "step": 8640 + }, + { + "epoch": 0.5294081645143521, + "grad_norm": 0.34842419624328613, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.5300201970744843, + "grad_norm": 0.38205671310424805, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0065, + "step": 8660 + }, + { + "epoch": 0.5306322296346165, + "grad_norm": 0.35549092292785645, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0068, + "step": 8670 + }, + { + "epoch": 0.5312442621947487, + "grad_norm": 0.15676020085811615, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0055, + "step": 8680 + }, + { + "epoch": 0.5318562947548809, + "grad_norm": 0.22985056042671204, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0071, + "step": 8690 + }, + { + "epoch": 0.5324683273150131, + "grad_norm": 0.2743426263332367, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0047, + "step": 8700 + }, + { + "epoch": 0.5330803598751453, + "grad_norm": 0.2503803074359894, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0079, + "step": 8710 + }, + { + "epoch": 0.5336923924352776, + "grad_norm": 0.5036469101905823, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5343044249954098, + "grad_norm": 0.2349964827299118, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0068, + "step": 8730 + }, + { + "epoch": 0.534916457555542, + "grad_norm": 0.28706061840057373, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0065, + "step": 8740 + }, + { + "epoch": 0.5355284901156742, + "grad_norm": 0.21812452375888824, + "learning_rate": 1.701081551967764e-05, + "loss": 0.008, + "step": 8750 + }, + { + "epoch": 0.5361405226758064, + "grad_norm": 0.301618754863739, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 0.5367525552359386, + "grad_norm": 0.35402950644493103, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0067, + "step": 8770 + }, + { + "epoch": 0.5373645877960708, + "grad_norm": 0.2875203788280487, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0097, + "step": 8780 + }, + { + "epoch": 0.537976620356203, + "grad_norm": 0.2358965128660202, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0053, + "step": 8790 + }, + { + "epoch": 0.5385886529163352, + "grad_norm": 0.14462094008922577, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0059, + "step": 8800 + }, + { + "epoch": 0.5392006854764674, + "grad_norm": 0.17893171310424805, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0062, + "step": 8810 + }, + { + "epoch": 0.5398127180365996, + "grad_norm": 0.2923351526260376, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0078, + "step": 8820 + }, + { + "epoch": 0.5404247505967318, + "grad_norm": 0.3288479745388031, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0067, + "step": 8830 + }, + { + "epoch": 0.541036783156864, + "grad_norm": 0.3996310532093048, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5416488157169962, + "grad_norm": 0.24345380067825317, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0067, + "step": 8850 + }, + { + "epoch": 0.5422608482771284, + "grad_norm": 0.26688340306282043, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0072, + "step": 8860 + }, + { + "epoch": 0.5428728808372606, + "grad_norm": 0.4816153645515442, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0099, + "step": 8870 + }, + { + "epoch": 0.5434849133973927, + "grad_norm": 0.22544988989830017, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.007, + "step": 8880 + }, + { + "epoch": 0.5440969459575249, + "grad_norm": 0.2820419669151306, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0064, + "step": 8890 + }, + { + "epoch": 0.5447089785176571, + "grad_norm": 0.2758846879005432, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0072, + "step": 8900 + }, + { + "epoch": 0.5453210110777893, + "grad_norm": 0.4620129466056824, + "learning_rate": 1.689381359053773e-05, + "loss": 0.008, + "step": 8910 + }, + { + "epoch": 0.5459330436379215, + "grad_norm": 0.5567039847373962, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0079, + "step": 8920 + }, + { + "epoch": 0.5465450761980537, + "grad_norm": 0.347251832485199, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.006, + "step": 8930 + }, + { + "epoch": 0.5471571087581859, + "grad_norm": 0.31768012046813965, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0065, + "step": 8940 + }, + { + "epoch": 0.5477691413183181, + "grad_norm": 0.24245156347751617, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0052, + "step": 8950 + }, + { + "epoch": 0.5483811738784503, + "grad_norm": 0.2124931961297989, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0074, + "step": 8960 + }, + { + "epoch": 0.5489932064385825, + "grad_norm": 0.18998636305332184, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0056, + "step": 8970 + }, + { + "epoch": 0.5496052389987147, + "grad_norm": 0.2667362689971924, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0056, + "step": 8980 + }, + { + "epoch": 0.5502172715588469, + "grad_norm": 0.4424617886543274, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0091, + "step": 8990 + }, + { + "epoch": 0.5508293041189791, + "grad_norm": 0.33623644709587097, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0061, + "step": 9000 + }, + { + "epoch": 0.5514413366791113, + "grad_norm": 0.29990604519844055, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0059, + "step": 9010 + }, + { + "epoch": 0.5520533692392435, + "grad_norm": 0.4384118914604187, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.5526654017993757, + "grad_norm": 0.3468496799468994, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0068, + "step": 9030 + }, + { + "epoch": 0.5532774343595079, + "grad_norm": 0.3473573327064514, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0062, + "step": 9040 + }, + { + "epoch": 0.5538894669196401, + "grad_norm": 0.36125242710113525, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0062, + "step": 9050 + }, + { + "epoch": 0.5545014994797723, + "grad_norm": 0.2603420615196228, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0091, + "step": 9060 + }, + { + "epoch": 0.5551135320399045, + "grad_norm": 0.27355659008026123, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0074, + "step": 9070 + }, + { + "epoch": 0.5557255646000367, + "grad_norm": 0.24741119146347046, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0064, + "step": 9080 + }, + { + "epoch": 0.556337597160169, + "grad_norm": 0.2001475840806961, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0094, + "step": 9090 + }, + { + "epoch": 0.5569496297203012, + "grad_norm": 0.41522347927093506, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0088, + "step": 9100 + }, + { + "epoch": 0.5575616622804334, + "grad_norm": 0.27282488346099854, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0062, + "step": 9110 + }, + { + "epoch": 0.5581736948405656, + "grad_norm": 0.26905956864356995, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.007, + "step": 9120 + }, + { + "epoch": 0.5587857274006978, + "grad_norm": 0.24747484922409058, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0084, + "step": 9130 + }, + { + "epoch": 0.55939775996083, + "grad_norm": 0.1863871067762375, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0061, + "step": 9140 + }, + { + "epoch": 0.5600097925209622, + "grad_norm": 0.3599740266799927, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0063, + "step": 9150 + }, + { + "epoch": 0.5606218250810943, + "grad_norm": 0.2238125205039978, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0047, + "step": 9160 + }, + { + "epoch": 0.5612338576412265, + "grad_norm": 0.272077351808548, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.006, + "step": 9170 + }, + { + "epoch": 0.5618458902013587, + "grad_norm": 0.2371625155210495, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0064, + "step": 9180 + }, + { + "epoch": 0.5624579227614909, + "grad_norm": 0.12783293426036835, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0054, + "step": 9190 + }, + { + "epoch": 0.5630699553216231, + "grad_norm": 0.3144581615924835, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0064, + "step": 9200 + }, + { + "epoch": 0.5636819878817553, + "grad_norm": 0.31995031237602234, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0082, + "step": 9210 + }, + { + "epoch": 0.5642940204418875, + "grad_norm": 0.31995660066604614, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0083, + "step": 9220 + }, + { + "epoch": 0.5649060530020197, + "grad_norm": 0.5018982291221619, + "learning_rate": 1.665453350687773e-05, + "loss": 0.007, + "step": 9230 + }, + { + "epoch": 0.5655180855621519, + "grad_norm": 0.2927841544151306, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0104, + "step": 9240 + }, + { + "epoch": 0.5661301181222841, + "grad_norm": 0.21124979853630066, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0068, + "step": 9250 + }, + { + "epoch": 0.5667421506824163, + "grad_norm": 0.25787463784217834, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.5673541832425485, + "grad_norm": 0.3194720447063446, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0065, + "step": 9270 + }, + { + "epoch": 0.5679662158026807, + "grad_norm": 0.24165599048137665, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.006, + "step": 9280 + }, + { + "epoch": 0.5685782483628129, + "grad_norm": 0.4880482256412506, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0096, + "step": 9290 + }, + { + "epoch": 0.5691902809229451, + "grad_norm": 0.24660199880599976, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0098, + "step": 9300 + }, + { + "epoch": 0.5698023134830773, + "grad_norm": 0.24707400798797607, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0052, + "step": 9310 + }, + { + "epoch": 0.5704143460432095, + "grad_norm": 0.33855682611465454, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.011, + "step": 9320 + }, + { + "epoch": 0.5710263786033417, + "grad_norm": 0.22913751006126404, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0074, + "step": 9330 + }, + { + "epoch": 0.5716384111634739, + "grad_norm": 0.24127185344696045, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0062, + "step": 9340 + }, + { + "epoch": 0.5722504437236061, + "grad_norm": 0.26104915142059326, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0065, + "step": 9350 + }, + { + "epoch": 0.5728624762837383, + "grad_norm": 0.21698857843875885, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0062, + "step": 9360 + }, + { + "epoch": 0.5734745088438705, + "grad_norm": 0.29092445969581604, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0081, + "step": 9370 + }, + { + "epoch": 0.5740865414040027, + "grad_norm": 0.2534378468990326, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0064, + "step": 9380 + }, + { + "epoch": 0.5746985739641349, + "grad_norm": 0.28900131583213806, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.5753106065242671, + "grad_norm": 0.3028101921081543, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0104, + "step": 9400 + }, + { + "epoch": 0.5759226390843993, + "grad_norm": 0.28851139545440674, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 0.5765346716445315, + "grad_norm": 0.5735841393470764, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0072, + "step": 9420 + }, + { + "epoch": 0.5771467042046637, + "grad_norm": 0.20355567336082458, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0056, + "step": 9430 + }, + { + "epoch": 0.5777587367647958, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.006, + "step": 9440 + }, + { + "epoch": 0.578370769324928, + "grad_norm": 0.2701684832572937, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 0.5789828018850602, + "grad_norm": 0.17381855845451355, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0054, + "step": 9460 + }, + { + "epoch": 0.5795948344451924, + "grad_norm": 0.250261515378952, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5802068670053246, + "grad_norm": 0.22972841560840607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0058, + "step": 9480 + }, + { + "epoch": 0.5808188995654568, + "grad_norm": 0.22654809057712555, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0061, + "step": 9490 + }, + { + "epoch": 0.581430932125589, + "grad_norm": 0.17165100574493408, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5820429646857213, + "grad_norm": 0.2462143450975418, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0054, + "step": 9510 + }, + { + "epoch": 0.5826549972458535, + "grad_norm": 0.3970383107662201, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0069, + "step": 9520 + }, + { + "epoch": 0.5832670298059857, + "grad_norm": 0.21578988432884216, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0054, + "step": 9530 + }, + { + "epoch": 0.5838790623661179, + "grad_norm": 0.5680915713310242, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0072, + "step": 9540 + }, + { + "epoch": 0.5844910949262501, + "grad_norm": 0.24070246517658234, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0073, + "step": 9550 + }, + { + "epoch": 0.5851031274863823, + "grad_norm": 0.2524685263633728, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0068, + "step": 9560 + }, + { + "epoch": 0.5857151600465145, + "grad_norm": 0.27286672592163086, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.008, + "step": 9570 + }, + { + "epoch": 0.5863271926066467, + "grad_norm": 0.3459629714488983, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0088, + "step": 9580 + }, + { + "epoch": 0.5869392251667789, + "grad_norm": 0.2964814603328705, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0066, + "step": 9590 + }, + { + "epoch": 0.5875512577269111, + "grad_norm": 0.3559853434562683, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0064, + "step": 9600 + }, + { + "epoch": 0.5881632902870433, + "grad_norm": 0.256898432970047, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0056, + "step": 9610 + }, + { + "epoch": 0.5887753228471755, + "grad_norm": 0.25032711029052734, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0052, + "step": 9620 + }, + { + "epoch": 0.5893873554073077, + "grad_norm": 0.2467224895954132, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0048, + "step": 9630 + }, + { + "epoch": 0.5899993879674399, + "grad_norm": 0.5331161618232727, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0078, + "step": 9640 + }, + { + "epoch": 0.5906114205275721, + "grad_norm": 0.33348897099494934, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0068, + "step": 9650 + }, + { + "epoch": 0.5912234530877043, + "grad_norm": 0.21435993909835815, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0058, + "step": 9660 + }, + { + "epoch": 0.5918354856478365, + "grad_norm": 0.35850396752357483, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0068, + "step": 9670 + }, + { + "epoch": 0.5924475182079687, + "grad_norm": 0.3007623851299286, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0063, + "step": 9680 + }, + { + "epoch": 0.5930595507681009, + "grad_norm": 0.22949714958667755, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0054, + "step": 9690 + }, + { + "epoch": 0.5936715833282331, + "grad_norm": 0.23259367048740387, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.5942836158883653, + "grad_norm": 0.2305079996585846, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0047, + "step": 9710 + }, + { + "epoch": 0.5948956484484974, + "grad_norm": 0.33875930309295654, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0063, + "step": 9720 + }, + { + "epoch": 0.5955076810086296, + "grad_norm": 0.3981896936893463, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0076, + "step": 9730 + }, + { + "epoch": 0.5961197135687618, + "grad_norm": 0.280831515789032, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0075, + "step": 9740 + }, + { + "epoch": 0.596731746128894, + "grad_norm": 0.26045629382133484, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0064, + "step": 9750 + }, + { + "epoch": 0.5973437786890262, + "grad_norm": 0.23102521896362305, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0071, + "step": 9760 + }, + { + "epoch": 0.5979558112491584, + "grad_norm": 0.5013224482536316, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0063, + "step": 9770 + }, + { + "epoch": 0.5985678438092906, + "grad_norm": 0.45689067244529724, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0088, + "step": 9780 + }, + { + "epoch": 0.5991798763694228, + "grad_norm": 0.27118632197380066, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.599791908929555, + "grad_norm": 0.420202374458313, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0076, + "step": 9800 + }, + { + "epoch": 0.6004039414896872, + "grad_norm": 0.35844025015830994, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0056, + "step": 9810 + }, + { + "epoch": 0.6010159740498194, + "grad_norm": 0.2205585241317749, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0082, + "step": 9820 + }, + { + "epoch": 0.6016280066099516, + "grad_norm": 0.18860426545143127, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.007, + "step": 9830 + }, + { + "epoch": 0.6022400391700838, + "grad_norm": 0.25045180320739746, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0082, + "step": 9840 + }, + { + "epoch": 0.602852071730216, + "grad_norm": 0.2581705152988434, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0065, + "step": 9850 + }, + { + "epoch": 0.6034641042903482, + "grad_norm": 0.25894811749458313, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0058, + "step": 9860 + }, + { + "epoch": 0.6040761368504804, + "grad_norm": 0.43305444717407227, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0066, + "step": 9870 + }, + { + "epoch": 0.6046881694106127, + "grad_norm": 0.2295757383108139, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0069, + "step": 9880 + }, + { + "epoch": 0.6053002019707449, + "grad_norm": 0.29785802960395813, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0074, + "step": 9890 + }, + { + "epoch": 0.6059122345308771, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0068, + "step": 9900 + }, + { + "epoch": 0.6065242670910093, + "grad_norm": 0.29115045070648193, + "learning_rate": 1.612387195896372e-05, + "loss": 0.008, + "step": 9910 + }, + { + "epoch": 0.6071362996511415, + "grad_norm": 0.3202555477619171, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0071, + "step": 9920 + }, + { + "epoch": 0.6077483322112737, + "grad_norm": 0.2849314212799072, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.005, + "step": 9930 + }, + { + "epoch": 0.6083603647714059, + "grad_norm": 0.2768756151199341, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0051, + "step": 9940 + }, + { + "epoch": 0.6089723973315381, + "grad_norm": 0.3138035535812378, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0058, + "step": 9950 + }, + { + "epoch": 0.6095844298916703, + "grad_norm": 0.20827682316303253, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0058, + "step": 9960 + }, + { + "epoch": 0.6101964624518025, + "grad_norm": 0.29986995458602905, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0076, + "step": 9970 + }, + { + "epoch": 0.6108084950119347, + "grad_norm": 0.23564326763153076, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0056, + "step": 9980 + }, + { + "epoch": 0.6114205275720669, + "grad_norm": 0.24854765832424164, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0066, + "step": 9990 + }, + { + "epoch": 0.6120325601321991, + "grad_norm": 0.5696694850921631, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0072, + "step": 10000 + }, + { + "epoch": 0.6126445926923312, + "grad_norm": 0.24267911911010742, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0061, + "step": 10010 + }, + { + "epoch": 0.6132566252524634, + "grad_norm": 0.1955283135175705, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0076, + "step": 10020 + }, + { + "epoch": 0.6138686578125956, + "grad_norm": 0.3427830934524536, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0073, + "step": 10030 + }, + { + "epoch": 0.6144806903727278, + "grad_norm": 0.38532915711402893, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0078, + "step": 10040 + }, + { + "epoch": 0.61509272293286, + "grad_norm": 0.4302294850349426, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0071, + "step": 10050 + }, + { + "epoch": 0.6157047554929922, + "grad_norm": 0.38420233130455017, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0072, + "step": 10060 + }, + { + "epoch": 0.6163167880531244, + "grad_norm": 0.23822636902332306, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.004, + "step": 10070 + }, + { + "epoch": 0.6169288206132566, + "grad_norm": 0.25123289227485657, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0065, + "step": 10080 + }, + { + "epoch": 0.6175408531733888, + "grad_norm": 0.23007746040821075, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0059, + "step": 10090 + }, + { + "epoch": 0.618152885733521, + "grad_norm": 0.24051082134246826, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0089, + "step": 10100 + }, + { + "epoch": 0.6187649182936532, + "grad_norm": 0.26246321201324463, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0052, + "step": 10110 + }, + { + "epoch": 0.6193769508537854, + "grad_norm": 0.3160432279109955, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0059, + "step": 10120 + }, + { + "epoch": 0.6199889834139176, + "grad_norm": 0.42534199357032776, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0071, + "step": 10130 + }, + { + "epoch": 0.6206010159740498, + "grad_norm": 0.22966268658638, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0052, + "step": 10140 + }, + { + "epoch": 0.621213048534182, + "grad_norm": 0.22234882414340973, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0073, + "step": 10150 + }, + { + "epoch": 0.6218250810943142, + "grad_norm": 0.31061676144599915, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0066, + "step": 10160 + }, + { + "epoch": 0.6224371136544464, + "grad_norm": 0.34178492426872253, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.6230491462145786, + "grad_norm": 0.263583779335022, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0079, + "step": 10180 + }, + { + "epoch": 0.6236611787747108, + "grad_norm": 0.3774336278438568, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0066, + "step": 10190 + }, + { + "epoch": 0.624273211334843, + "grad_norm": 0.29274430871009827, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.007, + "step": 10200 + }, + { + "epoch": 0.6248852438949752, + "grad_norm": 0.31850868463516235, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0056, + "step": 10210 + }, + { + "epoch": 0.6254972764551074, + "grad_norm": 0.3084369897842407, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0086, + "step": 10220 + }, + { + "epoch": 0.6261093090152396, + "grad_norm": 0.21596118807792664, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0072, + "step": 10230 + }, + { + "epoch": 0.6267213415753718, + "grad_norm": 0.16397996246814728, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0075, + "step": 10240 + }, + { + "epoch": 0.627333374135504, + "grad_norm": 0.15055827796459198, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0046, + "step": 10250 + }, + { + "epoch": 0.6279454066956363, + "grad_norm": 0.23483684659004211, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0064, + "step": 10260 + }, + { + "epoch": 0.6285574392557685, + "grad_norm": 0.3131091594696045, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0069, + "step": 10270 + }, + { + "epoch": 0.6291694718159007, + "grad_norm": 0.27958226203918457, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0067, + "step": 10280 + }, + { + "epoch": 0.6297815043760328, + "grad_norm": 0.23422567546367645, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0086, + "step": 10290 + }, + { + "epoch": 0.630393536936165, + "grad_norm": 0.4644703269004822, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0067, + "step": 10300 + }, + { + "epoch": 0.6310055694962972, + "grad_norm": 0.45787107944488525, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0068, + "step": 10310 + }, + { + "epoch": 0.6316176020564294, + "grad_norm": 0.21038737893104553, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0074, + "step": 10320 + }, + { + "epoch": 0.6322296346165616, + "grad_norm": 0.23812010884284973, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0054, + "step": 10330 + }, + { + "epoch": 0.6328416671766938, + "grad_norm": 0.36856284737586975, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0061, + "step": 10340 + }, + { + "epoch": 0.633453699736826, + "grad_norm": 0.3540131151676178, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0074, + "step": 10350 + }, + { + "epoch": 0.6340657322969582, + "grad_norm": 0.3004823923110962, + "learning_rate": 1.575723252169281e-05, + "loss": 0.006, + "step": 10360 + }, + { + "epoch": 0.6346777648570904, + "grad_norm": 0.17188489437103271, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0053, + "step": 10370 + }, + { + "epoch": 0.6352897974172226, + "grad_norm": 0.21710847318172455, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0062, + "step": 10380 + }, + { + "epoch": 0.6359018299773548, + "grad_norm": 0.2356785386800766, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0061, + "step": 10390 + }, + { + "epoch": 0.636513862537487, + "grad_norm": 0.2736414670944214, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.6371258950976192, + "grad_norm": 0.23872444033622742, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.006, + "step": 10410 + }, + { + "epoch": 0.6377379276577514, + "grad_norm": 0.24478361010551453, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0073, + "step": 10420 + }, + { + "epoch": 0.6383499602178836, + "grad_norm": 0.2964334487915039, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0048, + "step": 10430 + }, + { + "epoch": 0.6389619927780158, + "grad_norm": 0.2760549783706665, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0051, + "step": 10440 + }, + { + "epoch": 0.639574025338148, + "grad_norm": 0.2598065137863159, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0072, + "step": 10450 + }, + { + "epoch": 0.6401860578982802, + "grad_norm": 0.346999853849411, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0052, + "step": 10460 + }, + { + "epoch": 0.6407980904584124, + "grad_norm": 0.31291016936302185, + "learning_rate": 1.56658563993822e-05, + "loss": 0.007, + "step": 10470 + }, + { + "epoch": 0.6414101230185446, + "grad_norm": 0.2631952166557312, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6420221555786768, + "grad_norm": 0.30895209312438965, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.006, + "step": 10490 + }, + { + "epoch": 0.642634188138809, + "grad_norm": 0.17614217102527618, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0069, + "step": 10500 + }, + { + "epoch": 0.6432462206989412, + "grad_norm": 0.38792312145233154, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0077, + "step": 10510 + }, + { + "epoch": 0.6438582532590734, + "grad_norm": 0.1722564697265625, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0054, + "step": 10520 + }, + { + "epoch": 0.6444702858192056, + "grad_norm": 0.2741699516773224, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0062, + "step": 10530 + }, + { + "epoch": 0.6450823183793378, + "grad_norm": 0.2059863954782486, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0073, + "step": 10540 + }, + { + "epoch": 0.64569435093947, + "grad_norm": 0.2702447474002838, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.6463063834996022, + "grad_norm": 0.2299312800168991, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0051, + "step": 10560 + }, + { + "epoch": 0.6469184160597343, + "grad_norm": 0.1995723992586136, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0057, + "step": 10570 + }, + { + "epoch": 0.6475304486198665, + "grad_norm": 0.30346980690956116, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0063, + "step": 10580 + }, + { + "epoch": 0.6481424811799987, + "grad_norm": 0.5040738582611084, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0064, + "step": 10590 + }, + { + "epoch": 0.6487545137401309, + "grad_norm": 0.16984818875789642, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0071, + "step": 10600 + }, + { + "epoch": 0.6493665463002631, + "grad_norm": 0.26560020446777344, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 0.6499785788603953, + "grad_norm": 0.4563823342323303, + "learning_rate": 1.554018740860716e-05, + "loss": 0.008, + "step": 10620 + }, + { + "epoch": 0.6505906114205275, + "grad_norm": 0.23272818326950073, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.006, + "step": 10630 + }, + { + "epoch": 0.6512026439806597, + "grad_norm": 0.19166870415210724, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0066, + "step": 10640 + }, + { + "epoch": 0.651814676540792, + "grad_norm": 0.2822705805301666, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0062, + "step": 10650 + }, + { + "epoch": 0.6524267091009242, + "grad_norm": 0.24001267552375793, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 0.6530387416610564, + "grad_norm": 0.2563900947570801, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0068, + "step": 10670 + }, + { + "epoch": 0.6536507742211886, + "grad_norm": 0.2747437357902527, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0058, + "step": 10680 + }, + { + "epoch": 0.6542628067813208, + "grad_norm": 0.39710354804992676, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.005, + "step": 10690 + }, + { + "epoch": 0.654874839341453, + "grad_norm": 0.30690231919288635, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0102, + "step": 10700 + }, + { + "epoch": 0.6554868719015852, + "grad_norm": 0.2879253923892975, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0072, + "step": 10710 + }, + { + "epoch": 0.6560989044617174, + "grad_norm": 0.19964110851287842, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.6567109370218496, + "grad_norm": 0.20109151303768158, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6573229695819818, + "grad_norm": 0.21469832956790924, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0048, + "step": 10740 + }, + { + "epoch": 0.657935002142114, + "grad_norm": 0.19622936844825745, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0057, + "step": 10750 + }, + { + "epoch": 0.6585470347022462, + "grad_norm": 0.2255190759897232, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0052, + "step": 10760 + }, + { + "epoch": 0.6591590672623784, + "grad_norm": 0.47484955191612244, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0066, + "step": 10770 + }, + { + "epoch": 0.6597710998225106, + "grad_norm": 0.32192179560661316, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0067, + "step": 10780 + }, + { + "epoch": 0.6603831323826428, + "grad_norm": 0.33044904470443726, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0061, + "step": 10790 + }, + { + "epoch": 0.660995164942775, + "grad_norm": 0.3206661343574524, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0072, + "step": 10800 + }, + { + "epoch": 0.6616071975029072, + "grad_norm": 0.34903818368911743, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0055, + "step": 10810 + }, + { + "epoch": 0.6622192300630394, + "grad_norm": 0.1982222944498062, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 0.6628312626231716, + "grad_norm": 0.25388309359550476, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "epoch": 0.6634432951833038, + "grad_norm": 0.2325269728899002, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0078, + "step": 10840 + }, + { + "epoch": 0.6640553277434359, + "grad_norm": 0.3364964425563812, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0054, + "step": 10850 + }, + { + "epoch": 0.6646673603035681, + "grad_norm": 0.198661208152771, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0061, + "step": 10860 + }, + { + "epoch": 0.6652793928637003, + "grad_norm": 0.333836168050766, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0058, + "step": 10870 + }, + { + "epoch": 0.6658914254238325, + "grad_norm": 0.21908101439476013, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0087, + "step": 10880 + }, + { + "epoch": 0.6665034579839647, + "grad_norm": 0.3094167709350586, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0062, + "step": 10890 + }, + { + "epoch": 0.6671154905440969, + "grad_norm": 0.28113746643066406, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0066, + "step": 10900 + }, + { + "epoch": 0.6677275231042291, + "grad_norm": 0.20239399373531342, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0071, + "step": 10910 + }, + { + "epoch": 0.6683395556643613, + "grad_norm": 0.32829156517982483, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0056, + "step": 10920 + }, + { + "epoch": 0.6689515882244935, + "grad_norm": 0.2950859069824219, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 0.6695636207846257, + "grad_norm": 0.36404141783714294, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0075, + "step": 10940 + }, + { + "epoch": 0.6701756533447579, + "grad_norm": 0.2479381114244461, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0055, + "step": 10950 + }, + { + "epoch": 0.6707876859048901, + "grad_norm": 0.1934390366077423, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.005, + "step": 10960 + }, + { + "epoch": 0.6713997184650223, + "grad_norm": 0.20912423729896545, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0064, + "step": 10970 + }, + { + "epoch": 0.6720117510251545, + "grad_norm": 0.1781405806541443, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0048, + "step": 10980 + }, + { + "epoch": 0.6726237835852867, + "grad_norm": 0.18812811374664307, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0048, + "step": 10990 + }, + { + "epoch": 0.6732358161454189, + "grad_norm": 0.2006077766418457, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0073, + "step": 11000 + }, + { + "epoch": 0.6738478487055511, + "grad_norm": 0.20471568405628204, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0068, + "step": 11010 + }, + { + "epoch": 0.6744598812656833, + "grad_norm": 0.2979716658592224, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0067, + "step": 11020 + }, + { + "epoch": 0.6750719138258156, + "grad_norm": 0.3256290853023529, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0054, + "step": 11030 + }, + { + "epoch": 0.6756839463859478, + "grad_norm": 0.3346560001373291, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0061, + "step": 11040 + }, + { + "epoch": 0.67629597894608, + "grad_norm": 0.35791122913360596, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0054, + "step": 11050 + }, + { + "epoch": 0.6769080115062122, + "grad_norm": 0.30428826808929443, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0066, + "step": 11060 + }, + { + "epoch": 0.6775200440663444, + "grad_norm": 0.31254154443740845, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0065, + "step": 11070 + }, + { + "epoch": 0.6781320766264766, + "grad_norm": 0.263028621673584, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0062, + "step": 11080 + }, + { + "epoch": 0.6787441091866088, + "grad_norm": 0.22496990859508514, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.679356141746741, + "grad_norm": 0.2647632360458374, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0072, + "step": 11100 + }, + { + "epoch": 0.6799681743068732, + "grad_norm": 0.2517150342464447, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0064, + "step": 11110 + }, + { + "epoch": 0.6805802068670054, + "grad_norm": 0.30550616979599, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0055, + "step": 11120 + }, + { + "epoch": 0.6811922394271375, + "grad_norm": 0.21312931180000305, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0074, + "step": 11130 + }, + { + "epoch": 0.6818042719872697, + "grad_norm": 0.21152199804782867, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0047, + "step": 11140 + }, + { + "epoch": 0.6824163045474019, + "grad_norm": 0.2030613273382187, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0045, + "step": 11150 + }, + { + "epoch": 0.6830283371075341, + "grad_norm": 0.30646151304244995, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0045, + "step": 11160 + }, + { + "epoch": 0.6836403696676663, + "grad_norm": 0.2693783938884735, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0061, + "step": 11170 + }, + { + "epoch": 0.6842524022277985, + "grad_norm": 0.25288495421409607, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0068, + "step": 11180 + }, + { + "epoch": 0.6848644347879307, + "grad_norm": 0.34989964962005615, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.007, + "step": 11190 + }, + { + "epoch": 0.6854764673480629, + "grad_norm": 0.192350834608078, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0064, + "step": 11200 + }, + { + "epoch": 0.6860884999081951, + "grad_norm": 0.3841196894645691, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.6867005324683273, + "grad_norm": 0.2168666571378708, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0063, + "step": 11220 + }, + { + "epoch": 0.6873125650284595, + "grad_norm": 0.2756234109401703, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0068, + "step": 11230 + }, + { + "epoch": 0.6879245975885917, + "grad_norm": 0.1971903294324875, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.006, + "step": 11240 + }, + { + "epoch": 0.6885366301487239, + "grad_norm": 0.3857499659061432, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0063, + "step": 11250 + }, + { + "epoch": 0.6891486627088561, + "grad_norm": 0.194110706448555, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0049, + "step": 11260 + }, + { + "epoch": 0.6897606952689883, + "grad_norm": 0.24935179948806763, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0054, + "step": 11270 + }, + { + "epoch": 0.6903727278291205, + "grad_norm": 0.5208527445793152, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0062, + "step": 11280 + }, + { + "epoch": 0.6909847603892527, + "grad_norm": 0.2917899191379547, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0058, + "step": 11290 + }, + { + "epoch": 0.6915967929493849, + "grad_norm": 0.42692577838897705, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0072, + "step": 11300 + }, + { + "epoch": 0.6922088255095171, + "grad_norm": 0.36888429522514343, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0059, + "step": 11310 + }, + { + "epoch": 0.6928208580696493, + "grad_norm": 0.26246029138565063, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0057, + "step": 11320 + }, + { + "epoch": 0.6934328906297815, + "grad_norm": 0.22163739800453186, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0078, + "step": 11330 + }, + { + "epoch": 0.6940449231899137, + "grad_norm": 0.33411458134651184, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0053, + "step": 11340 + }, + { + "epoch": 0.6946569557500459, + "grad_norm": 0.2792898118495941, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0095, + "step": 11350 + }, + { + "epoch": 0.6952689883101781, + "grad_norm": 0.2770175039768219, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0059, + "step": 11360 + }, + { + "epoch": 0.6958810208703103, + "grad_norm": 0.14913171529769897, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0046, + "step": 11370 + }, + { + "epoch": 0.6964930534304425, + "grad_norm": 0.22906239330768585, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0054, + "step": 11380 + }, + { + "epoch": 0.6971050859905747, + "grad_norm": 0.2854336202144623, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0052, + "step": 11390 + }, + { + "epoch": 0.697717118550707, + "grad_norm": 0.21835818886756897, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0064, + "step": 11400 + }, + { + "epoch": 0.698329151110839, + "grad_norm": 0.42180293798446655, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0071, + "step": 11410 + }, + { + "epoch": 0.6989411836709712, + "grad_norm": 0.3056841492652893, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0092, + "step": 11420 + }, + { + "epoch": 0.6995532162311034, + "grad_norm": 0.15149559080600739, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0049, + "step": 11430 + }, + { + "epoch": 0.7001652487912357, + "grad_norm": 0.15561188757419586, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0051, + "step": 11440 + }, + { + "epoch": 0.7007772813513679, + "grad_norm": 0.2941122055053711, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0065, + "step": 11450 + }, + { + "epoch": 0.7013893139115001, + "grad_norm": 0.3008195757865906, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0059, + "step": 11460 + }, + { + "epoch": 0.7020013464716323, + "grad_norm": 0.3787235617637634, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0068, + "step": 11470 + }, + { + "epoch": 0.7026133790317645, + "grad_norm": 0.2069675624370575, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.005, + "step": 11480 + }, + { + "epoch": 0.7032254115918967, + "grad_norm": 0.33505553007125854, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0058, + "step": 11490 + }, + { + "epoch": 0.7038374441520289, + "grad_norm": 0.281213641166687, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0064, + "step": 11500 + }, + { + "epoch": 0.7044494767121611, + "grad_norm": 0.28471192717552185, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0066, + "step": 11510 + }, + { + "epoch": 0.7050615092722933, + "grad_norm": 0.3166801929473877, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0062, + "step": 11520 + }, + { + "epoch": 0.7056735418324255, + "grad_norm": 0.26893407106399536, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.005, + "step": 11530 + }, + { + "epoch": 0.7062855743925577, + "grad_norm": 0.17421478033065796, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0063, + "step": 11540 + }, + { + "epoch": 0.7068976069526899, + "grad_norm": 0.40999990701675415, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 0.7075096395128221, + "grad_norm": 0.190180242061615, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0061, + "step": 11560 + }, + { + "epoch": 0.7081216720729543, + "grad_norm": 0.20383603870868683, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0049, + "step": 11570 + }, + { + "epoch": 0.7087337046330865, + "grad_norm": 0.28741395473480225, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.7093457371932187, + "grad_norm": 0.24231962859630585, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.008, + "step": 11590 + }, + { + "epoch": 0.7099577697533509, + "grad_norm": 0.2221115529537201, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0053, + "step": 11600 + }, + { + "epoch": 0.7105698023134831, + "grad_norm": 0.18564820289611816, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0072, + "step": 11610 + }, + { + "epoch": 0.7111818348736153, + "grad_norm": 0.3734343647956848, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0089, + "step": 11620 + }, + { + "epoch": 0.7117938674337475, + "grad_norm": 0.3215912878513336, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0093, + "step": 11630 + }, + { + "epoch": 0.7124058999938797, + "grad_norm": 0.22602899372577667, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0062, + "step": 11640 + }, + { + "epoch": 0.7130179325540119, + "grad_norm": 0.3115978538990021, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.006, + "step": 11650 + }, + { + "epoch": 0.7136299651141441, + "grad_norm": 0.26148155331611633, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0071, + "step": 11660 + }, + { + "epoch": 0.7142419976742763, + "grad_norm": 0.142781600356102, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0051, + "step": 11670 + }, + { + "epoch": 0.7148540302344085, + "grad_norm": 0.21306048333644867, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0068, + "step": 11680 + }, + { + "epoch": 0.7154660627945407, + "grad_norm": 0.3439876437187195, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7160780953546728, + "grad_norm": 0.4010280966758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0062, + "step": 11700 + }, + { + "epoch": 0.716690127914805, + "grad_norm": 0.2760031819343567, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 0.7173021604749372, + "grad_norm": 0.45097261667251587, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0062, + "step": 11720 + }, + { + "epoch": 0.7179141930350694, + "grad_norm": 0.20118115842342377, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0054, + "step": 11730 + }, + { + "epoch": 0.7185262255952016, + "grad_norm": 0.3090760409832001, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0054, + "step": 11740 + }, + { + "epoch": 0.7191382581553338, + "grad_norm": 0.25016647577285767, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0077, + "step": 11750 + }, + { + "epoch": 0.719750290715466, + "grad_norm": 0.2310703545808792, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0053, + "step": 11760 + }, + { + "epoch": 0.7203623232755982, + "grad_norm": 0.2269359678030014, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.006, + "step": 11770 + }, + { + "epoch": 0.7209743558357304, + "grad_norm": 0.3917788565158844, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7215863883958626, + "grad_norm": 0.25999465584754944, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0064, + "step": 11790 + }, + { + "epoch": 0.7221984209559948, + "grad_norm": 0.19340357184410095, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0071, + "step": 11800 + }, + { + "epoch": 0.722810453516127, + "grad_norm": 0.25046268105506897, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0078, + "step": 11810 + }, + { + "epoch": 0.7234224860762593, + "grad_norm": 0.19819264113903046, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.005, + "step": 11820 + }, + { + "epoch": 0.7240345186363915, + "grad_norm": 0.43484950065612793, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0078, + "step": 11830 + }, + { + "epoch": 0.7246465511965237, + "grad_norm": 0.29191601276397705, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7252585837566559, + "grad_norm": 0.21717441082000732, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0056, + "step": 11850 + }, + { + "epoch": 0.7258706163167881, + "grad_norm": 0.3210129737854004, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0072, + "step": 11860 + }, + { + "epoch": 0.7264826488769203, + "grad_norm": 0.33192649483680725, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0061, + "step": 11870 + }, + { + "epoch": 0.7270946814370525, + "grad_norm": 0.14648163318634033, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7277067139971847, + "grad_norm": 0.20028764009475708, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0052, + "step": 11890 + }, + { + "epoch": 0.7283187465573169, + "grad_norm": 0.21449612081050873, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0063, + "step": 11900 + }, + { + "epoch": 0.7289307791174491, + "grad_norm": 0.27472081780433655, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0062, + "step": 11910 + }, + { + "epoch": 0.7295428116775813, + "grad_norm": 0.2919130027294159, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0048, + "step": 11920 + }, + { + "epoch": 0.7301548442377135, + "grad_norm": 0.153092160820961, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0053, + "step": 11930 + }, + { + "epoch": 0.7307668767978457, + "grad_norm": 0.22820086777210236, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0058, + "step": 11940 + }, + { + "epoch": 0.7313789093579779, + "grad_norm": 0.24281881749629974, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0044, + "step": 11950 + }, + { + "epoch": 0.7319909419181101, + "grad_norm": 0.32581812143325806, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0051, + "step": 11960 + }, + { + "epoch": 0.7326029744782423, + "grad_norm": 0.3139822483062744, + "learning_rate": 1.435930222050582e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 0.7332150070383744, + "grad_norm": 0.37985655665397644, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0052, + "step": 11980 + }, + { + "epoch": 0.7338270395985066, + "grad_norm": 0.1958508938550949, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.007, + "step": 11990 + }, + { + "epoch": 0.7344390721586388, + "grad_norm": 0.25318172574043274, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0084, + "step": 12000 + }, + { + "epoch": 0.735051104718771, + "grad_norm": 0.33245304226875305, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0051, + "step": 12010 + }, + { + "epoch": 0.7356631372789032, + "grad_norm": 0.2750372290611267, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0057, + "step": 12020 + }, + { + "epoch": 0.7362751698390354, + "grad_norm": 0.2057010382413864, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0057, + "step": 12030 + }, + { + "epoch": 0.7368872023991676, + "grad_norm": 0.30713731050491333, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0067, + "step": 12040 + }, + { + "epoch": 0.7374992349592998, + "grad_norm": 0.20423808693885803, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.006, + "step": 12050 + }, + { + "epoch": 0.738111267519432, + "grad_norm": 0.3129539489746094, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0067, + "step": 12060 + }, + { + "epoch": 0.7387233000795642, + "grad_norm": 0.25026270747184753, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7393353326396964, + "grad_norm": 0.4147534668445587, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0061, + "step": 12080 + }, + { + "epoch": 0.7399473651998286, + "grad_norm": 0.20954278111457825, + "learning_rate": 1.425047976058418e-05, + "loss": 0.006, + "step": 12090 + }, + { + "epoch": 0.7405593977599608, + "grad_norm": 0.2700798809528351, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 0.741171430320093, + "grad_norm": 0.2597086429595947, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0059, + "step": 12110 + }, + { + "epoch": 0.7417834628802252, + "grad_norm": 0.2674495279788971, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0045, + "step": 12120 + }, + { + "epoch": 0.7423954954403574, + "grad_norm": 0.24583879113197327, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0061, + "step": 12130 + }, + { + "epoch": 0.7430075280004896, + "grad_norm": 0.23704801499843597, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0054, + "step": 12140 + }, + { + "epoch": 0.7436195605606218, + "grad_norm": 0.2381024807691574, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0073, + "step": 12150 + }, + { + "epoch": 0.744231593120754, + "grad_norm": 0.24937355518341064, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0049, + "step": 12160 + }, + { + "epoch": 0.7448436256808862, + "grad_norm": 0.20442882180213928, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0061, + "step": 12170 + }, + { + "epoch": 0.7454556582410184, + "grad_norm": 0.3053426742553711, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0087, + "step": 12180 + }, + { + "epoch": 0.7460676908011507, + "grad_norm": 0.3654315769672394, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0047, + "step": 12190 + }, + { + "epoch": 0.7466797233612829, + "grad_norm": 0.18926535546779633, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7472917559214151, + "grad_norm": 0.21620485186576843, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0094, + "step": 12210 + }, + { + "epoch": 0.7479037884815473, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0059, + "step": 12220 + }, + { + "epoch": 0.7485158210416795, + "grad_norm": 0.39795419573783875, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 0.7491278536018117, + "grad_norm": 0.20502857863903046, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0048, + "step": 12240 + }, + { + "epoch": 0.7497398861619439, + "grad_norm": 0.23821429908275604, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0082, + "step": 12250 + }, + { + "epoch": 0.750351918722076, + "grad_norm": 0.45541366934776306, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0071, + "step": 12260 + }, + { + "epoch": 0.7509639512822082, + "grad_norm": 0.24881400167942047, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.7515759838423404, + "grad_norm": 0.2409125715494156, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0061, + "step": 12280 + }, + { + "epoch": 0.7521880164024726, + "grad_norm": 0.2930417060852051, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0054, + "step": 12290 + }, + { + "epoch": 0.7528000489626048, + "grad_norm": 0.30566394329071045, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0054, + "step": 12300 + }, + { + "epoch": 0.753412081522737, + "grad_norm": 0.32679763436317444, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0059, + "step": 12310 + }, + { + "epoch": 0.7540241140828692, + "grad_norm": 0.29273876547813416, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0067, + "step": 12320 + }, + { + "epoch": 0.7546361466430014, + "grad_norm": 0.19642773270606995, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0062, + "step": 12330 + }, + { + "epoch": 0.7552481792031336, + "grad_norm": 0.21928250789642334, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0066, + "step": 12340 + }, + { + "epoch": 0.7558602117632658, + "grad_norm": 0.2534322738647461, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.756472244323398, + "grad_norm": 0.20712649822235107, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 0.7570842768835302, + "grad_norm": 0.18670639395713806, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0063, + "step": 12370 + }, + { + "epoch": 0.7576963094436624, + "grad_norm": 0.26770254969596863, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0052, + "step": 12380 + }, + { + "epoch": 0.7583083420037946, + "grad_norm": 0.3621291518211365, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0056, + "step": 12390 + }, + { + "epoch": 0.7589203745639268, + "grad_norm": 0.31771939992904663, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0059, + "step": 12400 + }, + { + "epoch": 0.759532407124059, + "grad_norm": 0.44418177008628845, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0065, + "step": 12410 + }, + { + "epoch": 0.7601444396841912, + "grad_norm": 0.2183474898338318, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0046, + "step": 12420 + }, + { + "epoch": 0.7607564722443234, + "grad_norm": 0.4400590658187866, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0061, + "step": 12430 + }, + { + "epoch": 0.7613685048044556, + "grad_norm": 0.296539843082428, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0059, + "step": 12440 + }, + { + "epoch": 0.7619805373645878, + "grad_norm": 0.352870374917984, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0055, + "step": 12450 + }, + { + "epoch": 0.76259256992472, + "grad_norm": 0.19494596123695374, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.7632046024848522, + "grad_norm": 0.3799489438533783, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0057, + "step": 12470 + }, + { + "epoch": 0.7638166350449844, + "grad_norm": 0.3572365641593933, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0053, + "step": 12480 + }, + { + "epoch": 0.7644286676051166, + "grad_norm": 0.2559097707271576, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0062, + "step": 12490 + }, + { + "epoch": 0.7650407001652488, + "grad_norm": 0.13144978880882263, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0065, + "step": 12500 + }, + { + "epoch": 0.765652732725381, + "grad_norm": 0.34635287523269653, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7662647652855132, + "grad_norm": 0.25615188479423523, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0057, + "step": 12520 + }, + { + "epoch": 0.7668767978456454, + "grad_norm": 0.17619644105434418, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0047, + "step": 12530 + }, + { + "epoch": 0.7674888304057775, + "grad_norm": 0.20169994235038757, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0068, + "step": 12540 + }, + { + "epoch": 0.7681008629659097, + "grad_norm": 0.49686071276664734, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0066, + "step": 12550 + }, + { + "epoch": 0.7687128955260419, + "grad_norm": 0.28179335594177246, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0058, + "step": 12560 + }, + { + "epoch": 0.7693249280861741, + "grad_norm": 0.28156182169914246, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.005, + "step": 12570 + }, + { + "epoch": 0.7699369606463063, + "grad_norm": 0.15054315328598022, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0051, + "step": 12580 + }, + { + "epoch": 0.7705489932064385, + "grad_norm": 0.22872644662857056, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0066, + "step": 12590 + }, + { + "epoch": 0.7711610257665708, + "grad_norm": 0.25821951031684875, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0054, + "step": 12600 + }, + { + "epoch": 0.771773058326703, + "grad_norm": 0.23592771589756012, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0059, + "step": 12610 + }, + { + "epoch": 0.7723850908868352, + "grad_norm": 0.34409141540527344, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0053, + "step": 12620 + }, + { + "epoch": 0.7729971234469674, + "grad_norm": 0.2803158760070801, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0042, + "step": 12630 + }, + { + "epoch": 0.7736091560070996, + "grad_norm": 0.32796284556388855, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0074, + "step": 12640 + }, + { + "epoch": 0.7742211885672318, + "grad_norm": 0.34749120473861694, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0054, + "step": 12650 + }, + { + "epoch": 0.774833221127364, + "grad_norm": 0.34066343307495117, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0082, + "step": 12660 + }, + { + "epoch": 0.7754452536874962, + "grad_norm": 0.4294384717941284, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0062, + "step": 12670 + }, + { + "epoch": 0.7760572862476284, + "grad_norm": 0.2355230748653412, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0069, + "step": 12680 + }, + { + "epoch": 0.7766693188077606, + "grad_norm": 0.3181976079940796, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 0.7772813513678928, + "grad_norm": 0.2763727605342865, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0052, + "step": 12700 + }, + { + "epoch": 0.777893383928025, + "grad_norm": 0.2938949465751648, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0041, + "step": 12710 + }, + { + "epoch": 0.7785054164881572, + "grad_norm": 0.31331220269203186, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0062, + "step": 12720 + }, + { + "epoch": 0.7791174490482894, + "grad_norm": 0.3389904797077179, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0061, + "step": 12730 + }, + { + "epoch": 0.7797294816084216, + "grad_norm": 0.2848975360393524, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0065, + "step": 12740 + }, + { + "epoch": 0.7803415141685538, + "grad_norm": 0.29838478565216064, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0061, + "step": 12750 + }, + { + "epoch": 0.780953546728686, + "grad_norm": 0.47004032135009766, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0066, + "step": 12760 + }, + { + "epoch": 0.7815655792888182, + "grad_norm": 0.26898056268692017, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0063, + "step": 12770 + }, + { + "epoch": 0.7821776118489504, + "grad_norm": 0.29459917545318604, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0065, + "step": 12780 + }, + { + "epoch": 0.7827896444090826, + "grad_norm": 0.3481508791446686, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0058, + "step": 12790 + }, + { + "epoch": 0.7834016769692148, + "grad_norm": 0.1707627922296524, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0053, + "step": 12800 + }, + { + "epoch": 0.784013709529347, + "grad_norm": 0.14735333621501923, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0058, + "step": 12810 + }, + { + "epoch": 0.7846257420894791, + "grad_norm": 0.28002044558525085, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.006, + "step": 12820 + }, + { + "epoch": 0.7852377746496113, + "grad_norm": 0.39598894119262695, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0062, + "step": 12830 + }, + { + "epoch": 0.7858498072097435, + "grad_norm": 0.19379247725009918, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0058, + "step": 12840 + }, + { + "epoch": 0.7864618397698757, + "grad_norm": 0.27260729670524597, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7870738723300079, + "grad_norm": 0.2845087945461273, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0052, + "step": 12860 + }, + { + "epoch": 0.7876859048901401, + "grad_norm": 0.37151217460632324, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0043, + "step": 12870 + }, + { + "epoch": 0.7882979374502723, + "grad_norm": 0.3387412130832672, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0046, + "step": 12880 + }, + { + "epoch": 0.7889099700104045, + "grad_norm": 0.42672809958457947, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7895220025705367, + "grad_norm": 0.20378202199935913, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.7901340351306689, + "grad_norm": 0.16417330503463745, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0045, + "step": 12910 + }, + { + "epoch": 0.7907460676908011, + "grad_norm": 0.1704142540693283, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0054, + "step": 12920 + }, + { + "epoch": 0.7913581002509333, + "grad_norm": 0.21494890749454498, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.7919701328110655, + "grad_norm": 0.3430638909339905, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0046, + "step": 12940 + }, + { + "epoch": 0.7925821653711977, + "grad_norm": 0.22641201317310333, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0049, + "step": 12950 + }, + { + "epoch": 0.79319419793133, + "grad_norm": 0.27153971791267395, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 0.7938062304914622, + "grad_norm": 0.2648560702800751, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 0.7944182630515944, + "grad_norm": 0.2148633897304535, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0048, + "step": 12980 + }, + { + "epoch": 0.7950302956117266, + "grad_norm": 0.35170191526412964, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0069, + "step": 12990 + }, + { + "epoch": 0.7956423281718588, + "grad_norm": 0.3539712429046631, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0067, + "step": 13000 + }, + { + "epoch": 0.796254360731991, + "grad_norm": 0.29938259720802307, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0102, + "step": 13010 + }, + { + "epoch": 0.7968663932921232, + "grad_norm": 0.35241010785102844, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7974784258522554, + "grad_norm": 0.2929113805294037, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0061, + "step": 13030 + }, + { + "epoch": 0.7980904584123876, + "grad_norm": 0.24052929878234863, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0046, + "step": 13040 + }, + { + "epoch": 0.7987024909725198, + "grad_norm": 0.21611042320728302, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0043, + "step": 13050 + }, + { + "epoch": 0.799314523532652, + "grad_norm": 0.23498570919036865, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.7999265560927842, + "grad_norm": 0.30229923129081726, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0068, + "step": 13070 + }, + { + "epoch": 0.8005385886529164, + "grad_norm": 0.2916681170463562, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0062, + "step": 13080 + }, + { + "epoch": 0.8011506212130486, + "grad_norm": 0.31905195116996765, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.8017626537731807, + "grad_norm": 0.22307109832763672, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 0.8023746863333129, + "grad_norm": 0.2815198004245758, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0054, + "step": 13110 + }, + { + "epoch": 0.8029867188934451, + "grad_norm": 0.18762829899787903, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0064, + "step": 13120 + }, + { + "epoch": 0.8035987514535773, + "grad_norm": 0.1918255090713501, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0064, + "step": 13130 + }, + { + "epoch": 0.8042107840137095, + "grad_norm": 0.3726229667663574, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0065, + "step": 13140 + }, + { + "epoch": 0.8048228165738417, + "grad_norm": 0.423285573720932, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0062, + "step": 13150 + }, + { + "epoch": 0.8054348491339739, + "grad_norm": 0.1709958165884018, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0052, + "step": 13160 + }, + { + "epoch": 0.8060468816941061, + "grad_norm": 0.3615981936454773, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 0.8066589142542383, + "grad_norm": 0.2101999819278717, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0053, + "step": 13180 + }, + { + "epoch": 0.8072709468143705, + "grad_norm": 0.14393582940101624, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.8078829793745027, + "grad_norm": 0.3704521656036377, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0056, + "step": 13200 + }, + { + "epoch": 0.8084950119346349, + "grad_norm": 0.23275913298130035, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0051, + "step": 13210 + }, + { + "epoch": 0.8091070444947671, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0045, + "step": 13220 + }, + { + "epoch": 0.8097190770548993, + "grad_norm": 0.21721667051315308, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.8103311096150315, + "grad_norm": 0.29456019401550293, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 0.8109431421751637, + "grad_norm": 0.19854630529880524, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0071, + "step": 13250 + }, + { + "epoch": 0.8115551747352959, + "grad_norm": 0.4318163990974426, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0059, + "step": 13260 + }, + { + "epoch": 0.8121672072954281, + "grad_norm": 0.3421531915664673, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.006, + "step": 13270 + }, + { + "epoch": 0.8127792398555603, + "grad_norm": 0.2370125651359558, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0054, + "step": 13280 + }, + { + "epoch": 0.8133912724156925, + "grad_norm": 0.2996460497379303, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 0.8140033049758247, + "grad_norm": 0.2911904454231262, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0053, + "step": 13300 + }, + { + "epoch": 0.8146153375359569, + "grad_norm": 0.26010408997535706, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0053, + "step": 13310 + }, + { + "epoch": 0.8152273700960891, + "grad_norm": 0.404702752828598, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0081, + "step": 13320 + }, + { + "epoch": 0.8158394026562213, + "grad_norm": 0.25591781735420227, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0057, + "step": 13330 + }, + { + "epoch": 0.8164514352163535, + "grad_norm": 0.1437849998474121, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0064, + "step": 13340 + }, + { + "epoch": 0.8170634677764858, + "grad_norm": 0.12252022325992584, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0047, + "step": 13350 + }, + { + "epoch": 0.817675500336618, + "grad_norm": 0.1861230581998825, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0063, + "step": 13360 + }, + { + "epoch": 0.8182875328967502, + "grad_norm": 0.2313026636838913, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0066, + "step": 13370 + }, + { + "epoch": 0.8188995654568824, + "grad_norm": 0.5445839166641235, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0076, + "step": 13380 + }, + { + "epoch": 0.8195115980170145, + "grad_norm": 0.21818871796131134, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0068, + "step": 13390 + }, + { + "epoch": 0.8201236305771467, + "grad_norm": 0.21823963522911072, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0072, + "step": 13400 + }, + { + "epoch": 0.8207356631372789, + "grad_norm": 0.1730659157037735, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0051, + "step": 13410 + }, + { + "epoch": 0.8213476956974111, + "grad_norm": 0.1301007866859436, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0075, + "step": 13420 + }, + { + "epoch": 0.8219597282575433, + "grad_norm": 0.32452520728111267, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.005, + "step": 13430 + }, + { + "epoch": 0.8225717608176755, + "grad_norm": 0.24771001935005188, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0058, + "step": 13440 + }, + { + "epoch": 0.8231837933778077, + "grad_norm": 0.4575227200984955, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0062, + "step": 13450 + }, + { + "epoch": 0.8237958259379399, + "grad_norm": 0.16441279649734497, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0081, + "step": 13460 + }, + { + "epoch": 0.8244078584980721, + "grad_norm": 0.26582902669906616, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0069, + "step": 13470 + }, + { + "epoch": 0.8250198910582043, + "grad_norm": 0.18871302902698517, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0068, + "step": 13480 + }, + { + "epoch": 0.8256319236183365, + "grad_norm": 0.23244783282279968, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0063, + "step": 13490 + }, + { + "epoch": 0.8262439561784687, + "grad_norm": 0.2399880290031433, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0036, + "step": 13500 + }, + { + "epoch": 0.8268559887386009, + "grad_norm": 0.25766822695732117, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0074, + "step": 13510 + }, + { + "epoch": 0.8274680212987331, + "grad_norm": 0.24792100489139557, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0066, + "step": 13520 + }, + { + "epoch": 0.8280800538588653, + "grad_norm": 0.3371896743774414, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8286920864189975, + "grad_norm": 0.16249819099903107, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0079, + "step": 13540 + }, + { + "epoch": 0.8293041189791297, + "grad_norm": 0.2705139219760895, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0069, + "step": 13550 + }, + { + "epoch": 0.8299161515392619, + "grad_norm": 0.1905352771282196, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0055, + "step": 13560 + }, + { + "epoch": 0.8305281840993941, + "grad_norm": 0.23938500881195068, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0055, + "step": 13570 + }, + { + "epoch": 0.8311402166595263, + "grad_norm": 0.3562251031398773, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0054, + "step": 13580 + }, + { + "epoch": 0.8317522492196585, + "grad_norm": 0.2934769093990326, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0064, + "step": 13590 + }, + { + "epoch": 0.8323642817797907, + "grad_norm": 0.252366840839386, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0078, + "step": 13600 + }, + { + "epoch": 0.8329763143399229, + "grad_norm": 0.16646964848041534, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0046, + "step": 13610 + }, + { + "epoch": 0.8335883469000551, + "grad_norm": 0.22584658861160278, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8342003794601873, + "grad_norm": 0.3578774034976959, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0049, + "step": 13630 + }, + { + "epoch": 0.8348124120203195, + "grad_norm": 0.3447739779949188, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0065, + "step": 13640 + }, + { + "epoch": 0.8354244445804517, + "grad_norm": 0.381954550743103, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0057, + "step": 13650 + }, + { + "epoch": 0.8360364771405839, + "grad_norm": 0.3563731908798218, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0065, + "step": 13660 + }, + { + "epoch": 0.836648509700716, + "grad_norm": 0.29516372084617615, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0053, + "step": 13670 + }, + { + "epoch": 0.8372605422608482, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0043, + "step": 13680 + }, + { + "epoch": 0.8378725748209804, + "grad_norm": 0.4608387351036072, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.005, + "step": 13690 + }, + { + "epoch": 0.8384846073811126, + "grad_norm": 0.31025534868240356, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0055, + "step": 13700 + }, + { + "epoch": 0.8390966399412448, + "grad_norm": 0.32904690504074097, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.839708672501377, + "grad_norm": 0.2547053098678589, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0061, + "step": 13720 + }, + { + "epoch": 0.8403207050615092, + "grad_norm": 0.30524104833602905, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.005, + "step": 13730 + }, + { + "epoch": 0.8409327376216414, + "grad_norm": 0.17741642892360687, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "epoch": 0.8415447701817736, + "grad_norm": 0.23125578463077545, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0053, + "step": 13750 + }, + { + "epoch": 0.8421568027419059, + "grad_norm": 0.3080023229122162, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0075, + "step": 13760 + }, + { + "epoch": 0.842768835302038, + "grad_norm": 0.2509821951389313, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0053, + "step": 13770 + }, + { + "epoch": 0.8433808678621703, + "grad_norm": 0.17483864724636078, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.005, + "step": 13780 + }, + { + "epoch": 0.8439929004223025, + "grad_norm": 0.3952518403530121, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0056, + "step": 13790 + }, + { + "epoch": 0.8446049329824347, + "grad_norm": 0.2945535480976105, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0055, + "step": 13800 + }, + { + "epoch": 0.8452169655425669, + "grad_norm": 0.13024291396141052, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0073, + "step": 13810 + }, + { + "epoch": 0.8458289981026991, + "grad_norm": 0.1840520054101944, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0061, + "step": 13820 + }, + { + "epoch": 0.8464410306628313, + "grad_norm": 0.2368786782026291, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0058, + "step": 13830 + }, + { + "epoch": 0.8470530632229635, + "grad_norm": 0.2885456085205078, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0055, + "step": 13840 + }, + { + "epoch": 0.8476650957830957, + "grad_norm": 0.2782488167285919, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0046, + "step": 13850 + }, + { + "epoch": 0.8482771283432279, + "grad_norm": 0.1711442470550537, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0055, + "step": 13860 + }, + { + "epoch": 0.8488891609033601, + "grad_norm": 0.22235877811908722, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0056, + "step": 13870 + }, + { + "epoch": 0.8495011934634923, + "grad_norm": 0.1937183290719986, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0047, + "step": 13880 + }, + { + "epoch": 0.8501132260236245, + "grad_norm": 0.33960190415382385, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0063, + "step": 13890 + }, + { + "epoch": 0.8507252585837567, + "grad_norm": 0.1983388215303421, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8513372911438889, + "grad_norm": 0.2968246638774872, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0051, + "step": 13910 + }, + { + "epoch": 0.8519493237040211, + "grad_norm": 0.25328314304351807, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0057, + "step": 13920 + }, + { + "epoch": 0.8525613562641533, + "grad_norm": 0.2435184270143509, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.8531733888242855, + "grad_norm": 0.24512560665607452, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0053, + "step": 13940 + }, + { + "epoch": 0.8537854213844176, + "grad_norm": 0.22028976678848267, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.006, + "step": 13950 + }, + { + "epoch": 0.8543974539445498, + "grad_norm": 0.24743935465812683, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0065, + "step": 13960 + }, + { + "epoch": 0.855009486504682, + "grad_norm": 0.1393810361623764, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0081, + "step": 13970 + }, + { + "epoch": 0.8556215190648142, + "grad_norm": 0.25975972414016724, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0063, + "step": 13980 + }, + { + "epoch": 0.8562335516249464, + "grad_norm": 0.1944616585969925, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0049, + "step": 13990 + }, + { + "epoch": 0.8568455841850786, + "grad_norm": 0.21936742961406708, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.8574576167452108, + "grad_norm": 0.1556629091501236, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 0.858069649305343, + "grad_norm": 0.23696991801261902, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 0.8586816818654752, + "grad_norm": 0.32507795095443726, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0061, + "step": 14030 + }, + { + "epoch": 0.8592937144256074, + "grad_norm": 0.35332199931144714, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0058, + "step": 14040 + }, + { + "epoch": 0.8599057469857396, + "grad_norm": 0.1835644394159317, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0046, + "step": 14050 + }, + { + "epoch": 0.8605177795458718, + "grad_norm": 0.19127517938613892, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0044, + "step": 14060 + }, + { + "epoch": 0.861129812106004, + "grad_norm": 0.30748996138572693, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0055, + "step": 14070 + }, + { + "epoch": 0.8617418446661362, + "grad_norm": 0.178785502910614, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0049, + "step": 14080 + }, + { + "epoch": 0.8623538772262684, + "grad_norm": 0.16979056596755981, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0044, + "step": 14090 + }, + { + "epoch": 0.8629659097864006, + "grad_norm": 0.19519983232021332, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0063, + "step": 14100 + }, + { + "epoch": 0.8635779423465328, + "grad_norm": 0.2722550928592682, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 0.864189974906665, + "grad_norm": 0.1956222504377365, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0054, + "step": 14120 + }, + { + "epoch": 0.8648020074667973, + "grad_norm": 0.32274308800697327, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0066, + "step": 14130 + }, + { + "epoch": 0.8654140400269295, + "grad_norm": 0.25953641533851624, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0056, + "step": 14140 + }, + { + "epoch": 0.8660260725870617, + "grad_norm": 0.3293299674987793, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0057, + "step": 14150 + }, + { + "epoch": 0.8666381051471939, + "grad_norm": 0.35404127836227417, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0072, + "step": 14160 + }, + { + "epoch": 0.8672501377073261, + "grad_norm": 0.24674376845359802, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0064, + "step": 14170 + }, + { + "epoch": 0.8678621702674583, + "grad_norm": 0.23506462574005127, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0047, + "step": 14180 + }, + { + "epoch": 0.8684742028275905, + "grad_norm": 0.30500903725624084, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.8690862353877227, + "grad_norm": 0.23000167310237885, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0051, + "step": 14200 + }, + { + "epoch": 0.8696982679478549, + "grad_norm": 0.17339368164539337, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 0.8703103005079871, + "grad_norm": 0.2505367696285248, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0059, + "step": 14220 + }, + { + "epoch": 0.8709223330681192, + "grad_norm": 0.22645734250545502, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0044, + "step": 14230 + }, + { + "epoch": 0.8715343656282514, + "grad_norm": 0.3509127199649811, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0043, + "step": 14240 + }, + { + "epoch": 0.8721463981883836, + "grad_norm": 0.2758972644805908, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0055, + "step": 14250 + }, + { + "epoch": 0.8727584307485158, + "grad_norm": 0.1943834275007248, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.006, + "step": 14260 + }, + { + "epoch": 0.873370463308648, + "grad_norm": 0.32881075143814087, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 0.8739824958687802, + "grad_norm": 0.35203438997268677, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8745945284289124, + "grad_norm": 0.13618917763233185, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0044, + "step": 14290 + }, + { + "epoch": 0.8752065609890446, + "grad_norm": 0.22939404845237732, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0044, + "step": 14300 + }, + { + "epoch": 0.8758185935491768, + "grad_norm": 0.2027491182088852, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0051, + "step": 14310 + }, + { + "epoch": 0.876430626109309, + "grad_norm": 0.21950028836727142, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0105, + "step": 14320 + }, + { + "epoch": 0.8770426586694412, + "grad_norm": 0.307913213968277, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0051, + "step": 14330 + }, + { + "epoch": 0.8776546912295734, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0058, + "step": 14340 + }, + { + "epoch": 0.8782667237897056, + "grad_norm": 0.3033636808395386, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0066, + "step": 14350 + }, + { + "epoch": 0.8788787563498378, + "grad_norm": 0.25514236092567444, + "learning_rate": 1.210961823379053e-05, + "loss": 0.005, + "step": 14360 + }, + { + "epoch": 0.87949078890997, + "grad_norm": 0.2574418783187866, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0069, + "step": 14370 + }, + { + "epoch": 0.8801028214701022, + "grad_norm": 0.17803016304969788, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.006, + "step": 14380 + }, + { + "epoch": 0.8807148540302344, + "grad_norm": 0.31375741958618164, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0077, + "step": 14390 + }, + { + "epoch": 0.8813268865903666, + "grad_norm": 0.18031778931617737, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0052, + "step": 14400 + }, + { + "epoch": 0.8819389191504988, + "grad_norm": 0.18077519536018372, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0055, + "step": 14410 + }, + { + "epoch": 0.882550951710631, + "grad_norm": 0.22171644866466522, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0059, + "step": 14420 + }, + { + "epoch": 0.8831629842707632, + "grad_norm": 0.16187389194965363, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0044, + "step": 14430 + }, + { + "epoch": 0.8837750168308954, + "grad_norm": 0.27667325735092163, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0063, + "step": 14440 + }, + { + "epoch": 0.8843870493910276, + "grad_norm": 0.2493051290512085, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0072, + "step": 14450 + }, + { + "epoch": 0.8849990819511598, + "grad_norm": 0.3519611656665802, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 0.885611114511292, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0057, + "step": 14470 + }, + { + "epoch": 0.8862231470714242, + "grad_norm": 0.24518658220767975, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.8868351796315564, + "grad_norm": 0.28493785858154297, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0055, + "step": 14490 + }, + { + "epoch": 0.8874472121916887, + "grad_norm": 0.22260263562202454, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 0.8880592447518207, + "grad_norm": 0.2804561257362366, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0051, + "step": 14510 + }, + { + "epoch": 0.888671277311953, + "grad_norm": 0.24349385499954224, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0045, + "step": 14520 + }, + { + "epoch": 0.8892833098720851, + "grad_norm": 0.262207955121994, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0082, + "step": 14530 + }, + { + "epoch": 0.8898953424322174, + "grad_norm": 0.15527820587158203, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0061, + "step": 14540 + }, + { + "epoch": 0.8905073749923496, + "grad_norm": 0.23850804567337036, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0042, + "step": 14550 + }, + { + "epoch": 0.8911194075524818, + "grad_norm": 0.2665582001209259, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0053, + "step": 14560 + }, + { + "epoch": 0.891731440112614, + "grad_norm": 0.2652167081832886, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 0.8923434726727462, + "grad_norm": 0.21386243402957916, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "epoch": 0.8929555052328784, + "grad_norm": 0.3087247312068939, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0082, + "step": 14590 + }, + { + "epoch": 0.8935675377930106, + "grad_norm": 0.2003909796476364, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0048, + "step": 14600 + }, + { + "epoch": 0.8941795703531428, + "grad_norm": 0.2214624583721161, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0062, + "step": 14610 + }, + { + "epoch": 0.894791602913275, + "grad_norm": 0.2500647306442261, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0052, + "step": 14620 + }, + { + "epoch": 0.8954036354734072, + "grad_norm": 0.2615419030189514, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0054, + "step": 14630 + }, + { + "epoch": 0.8960156680335394, + "grad_norm": 0.21347551047801971, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0048, + "step": 14640 + }, + { + "epoch": 0.8966277005936716, + "grad_norm": 0.35483887791633606, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0054, + "step": 14650 + }, + { + "epoch": 0.8972397331538038, + "grad_norm": 0.2423439472913742, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0055, + "step": 14660 + }, + { + "epoch": 0.897851765713936, + "grad_norm": 0.16826359927654266, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 0.8984637982740682, + "grad_norm": 0.3589499294757843, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "epoch": 0.8990758308342004, + "grad_norm": 0.3081042468547821, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 0.8996878633943326, + "grad_norm": 0.31996914744377136, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0063, + "step": 14700 + }, + { + "epoch": 0.9002998959544648, + "grad_norm": 0.301209419965744, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0073, + "step": 14710 + }, + { + "epoch": 0.900911928514597, + "grad_norm": 0.19257168471813202, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0055, + "step": 14720 + }, + { + "epoch": 0.9015239610747292, + "grad_norm": 0.15221600234508514, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0053, + "step": 14730 + }, + { + "epoch": 0.9021359936348614, + "grad_norm": 0.21519577503204346, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0055, + "step": 14740 + }, + { + "epoch": 0.9027480261949936, + "grad_norm": 0.23772196471691132, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.006, + "step": 14750 + }, + { + "epoch": 0.9033600587551258, + "grad_norm": 0.2872219979763031, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.903972091315258, + "grad_norm": 0.2589483857154846, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0056, + "step": 14770 + }, + { + "epoch": 0.9045841238753902, + "grad_norm": 0.31850162148475647, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0051, + "step": 14780 + }, + { + "epoch": 0.9051961564355223, + "grad_norm": 0.27179282903671265, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0051, + "step": 14790 + }, + { + "epoch": 0.9058081889956545, + "grad_norm": 0.4132739007472992, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.005, + "step": 14800 + }, + { + "epoch": 0.9064202215557867, + "grad_norm": 0.19336774945259094, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0041, + "step": 14810 + }, + { + "epoch": 0.9070322541159189, + "grad_norm": 0.20783282816410065, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.9076442866760511, + "grad_norm": 0.26141899824142456, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0069, + "step": 14830 + }, + { + "epoch": 0.9082563192361833, + "grad_norm": 0.2158539742231369, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0081, + "step": 14840 + }, + { + "epoch": 0.9088683517963155, + "grad_norm": 0.3233732581138611, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0065, + "step": 14850 + }, + { + "epoch": 0.9094803843564477, + "grad_norm": 0.23924769461154938, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0042, + "step": 14860 + }, + { + "epoch": 0.9100924169165799, + "grad_norm": 0.17663812637329102, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.004, + "step": 14870 + }, + { + "epoch": 0.9107044494767121, + "grad_norm": 0.34379643201828003, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.005, + "step": 14880 + }, + { + "epoch": 0.9113164820368443, + "grad_norm": 0.29971349239349365, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0059, + "step": 14890 + }, + { + "epoch": 0.9119285145969765, + "grad_norm": 0.24832949042320251, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0042, + "step": 14900 + }, + { + "epoch": 0.9125405471571088, + "grad_norm": 0.22288024425506592, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0065, + "step": 14910 + }, + { + "epoch": 0.913152579717241, + "grad_norm": 0.2806689441204071, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0043, + "step": 14920 + }, + { + "epoch": 0.9137646122773732, + "grad_norm": 0.3908274173736572, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0047, + "step": 14930 + }, + { + "epoch": 0.9143766448375054, + "grad_norm": 0.16255778074264526, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0068, + "step": 14940 + }, + { + "epoch": 0.9149886773976376, + "grad_norm": 0.430791437625885, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0085, + "step": 14950 + }, + { + "epoch": 0.9156007099577698, + "grad_norm": 0.1739969551563263, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0056, + "step": 14960 + }, + { + "epoch": 0.916212742517902, + "grad_norm": 0.24298283457756042, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0055, + "step": 14970 + }, + { + "epoch": 0.9168247750780342, + "grad_norm": 0.21269915997982025, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0051, + "step": 14980 + }, + { + "epoch": 0.9174368076381664, + "grad_norm": 0.263388991355896, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0058, + "step": 14990 + }, + { + "epoch": 0.9180488401982986, + "grad_norm": 0.28030532598495483, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 0.9186608727584308, + "grad_norm": 0.17051894962787628, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 0.919272905318563, + "grad_norm": 0.2763383388519287, + "learning_rate": 1.146875176249365e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 0.9198849378786952, + "grad_norm": 0.2616822421550751, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "epoch": 0.9204969704388274, + "grad_norm": 0.21407093107700348, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0062, + "step": 15040 + }, + { + "epoch": 0.9211090029989596, + "grad_norm": 0.23936578631401062, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0073, + "step": 15050 + }, + { + "epoch": 0.9217210355590918, + "grad_norm": 0.26383110880851746, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.006, + "step": 15060 + }, + { + "epoch": 0.922333068119224, + "grad_norm": 0.19477945566177368, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0043, + "step": 15070 + }, + { + "epoch": 0.9229451006793561, + "grad_norm": 0.16677282750606537, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0061, + "step": 15080 + }, + { + "epoch": 0.9235571332394883, + "grad_norm": 0.26856037974357605, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0065, + "step": 15090 + }, + { + "epoch": 0.9241691657996205, + "grad_norm": 0.20086173713207245, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0056, + "step": 15100 + }, + { + "epoch": 0.9247811983597527, + "grad_norm": 0.26998719573020935, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0034, + "step": 15110 + }, + { + "epoch": 0.9253932309198849, + "grad_norm": 0.12727728486061096, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0043, + "step": 15120 + }, + { + "epoch": 0.9260052634800171, + "grad_norm": 0.11288347095251083, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0055, + "step": 15130 + }, + { + "epoch": 0.9266172960401493, + "grad_norm": 0.1109771579504013, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0048, + "step": 15140 + }, + { + "epoch": 0.9272293286002815, + "grad_norm": 0.2556479275226593, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0056, + "step": 15150 + }, + { + "epoch": 0.9278413611604137, + "grad_norm": 0.2149561196565628, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.005, + "step": 15160 + }, + { + "epoch": 0.9284533937205459, + "grad_norm": 0.16953054070472717, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 0.9290654262806781, + "grad_norm": 0.18306049704551697, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.004, + "step": 15180 + }, + { + "epoch": 0.9296774588408103, + "grad_norm": 0.15755385160446167, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0035, + "step": 15190 + }, + { + "epoch": 0.9302894914009425, + "grad_norm": 0.21062517166137695, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 0.9309015239610747, + "grad_norm": 0.1403888463973999, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0051, + "step": 15210 + }, + { + "epoch": 0.9315135565212069, + "grad_norm": 0.4044550359249115, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0062, + "step": 15220 + }, + { + "epoch": 0.9321255890813391, + "grad_norm": 0.22543896734714508, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 0.9327376216414713, + "grad_norm": 0.2025403380393982, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0055, + "step": 15240 + }, + { + "epoch": 0.9333496542016035, + "grad_norm": 1.0549683570861816, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0092, + "step": 15250 + }, + { + "epoch": 0.9339616867617357, + "grad_norm": 0.3442397117614746, + "learning_rate": 1.123494277220359e-05, + "loss": 0.005, + "step": 15260 + }, + { + "epoch": 0.934573719321868, + "grad_norm": 0.1678813248872757, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.005, + "step": 15270 + }, + { + "epoch": 0.9351857518820001, + "grad_norm": 0.31081119179725647, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 0.9357977844421324, + "grad_norm": 0.25498780608177185, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.006, + "step": 15290 + }, + { + "epoch": 0.9364098170022646, + "grad_norm": 0.21825125813484192, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0054, + "step": 15300 + }, + { + "epoch": 0.9370218495623968, + "grad_norm": 0.19719983637332916, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0074, + "step": 15310 + }, + { + "epoch": 0.937633882122529, + "grad_norm": 0.32297465205192566, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0058, + "step": 15320 + }, + { + "epoch": 0.9382459146826612, + "grad_norm": 0.2717733383178711, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0035, + "step": 15330 + }, + { + "epoch": 0.9388579472427934, + "grad_norm": 0.22138433158397675, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0048, + "step": 15340 + }, + { + "epoch": 0.9394699798029256, + "grad_norm": 0.1943465769290924, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 0.9400820123630577, + "grad_norm": 0.18422184884548187, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0054, + "step": 15360 + }, + { + "epoch": 0.9406940449231899, + "grad_norm": 0.17614246904850006, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0054, + "step": 15370 + }, + { + "epoch": 0.9413060774833221, + "grad_norm": 0.17661592364311218, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9419181100434543, + "grad_norm": 0.42976850271224976, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0054, + "step": 15390 + }, + { + "epoch": 0.9425301426035865, + "grad_norm": 0.34272316098213196, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0044, + "step": 15400 + }, + { + "epoch": 0.9431421751637187, + "grad_norm": 0.3346613645553589, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0042, + "step": 15410 + }, + { + "epoch": 0.9437542077238509, + "grad_norm": 0.15300114452838898, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0057, + "step": 15420 + }, + { + "epoch": 0.9443662402839831, + "grad_norm": 0.23935656249523163, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0084, + "step": 15430 + }, + { + "epoch": 0.9449782728441153, + "grad_norm": 0.21595227718353271, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0051, + "step": 15440 + }, + { + "epoch": 0.9455903054042475, + "grad_norm": 0.2670149505138397, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0059, + "step": 15450 + }, + { + "epoch": 0.9462023379643797, + "grad_norm": 0.2214009314775467, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0054, + "step": 15460 + }, + { + "epoch": 0.9468143705245119, + "grad_norm": 0.3491996228694916, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 0.9474264030846441, + "grad_norm": 0.28213024139404297, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0054, + "step": 15480 + }, + { + "epoch": 0.9480384356447763, + "grad_norm": 0.30218765139579773, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0049, + "step": 15490 + }, + { + "epoch": 0.9486504682049085, + "grad_norm": 0.17068025469779968, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0046, + "step": 15500 + }, + { + "epoch": 0.9492625007650407, + "grad_norm": 0.23325121402740479, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0054, + "step": 15510 + }, + { + "epoch": 0.9498745333251729, + "grad_norm": 0.22118528187274933, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0035, + "step": 15520 + }, + { + "epoch": 0.9504865658853051, + "grad_norm": 0.20202121138572693, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9510985984454373, + "grad_norm": 0.28455010056495667, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0039, + "step": 15540 + }, + { + "epoch": 0.9517106310055695, + "grad_norm": 0.26871445775032043, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0046, + "step": 15550 + }, + { + "epoch": 0.9523226635657017, + "grad_norm": 0.33665943145751953, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0058, + "step": 15560 + }, + { + "epoch": 0.9529346961258339, + "grad_norm": 0.3182595670223236, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0053, + "step": 15570 + }, + { + "epoch": 0.9535467286859661, + "grad_norm": 0.2867930829524994, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0068, + "step": 15580 + }, + { + "epoch": 0.9541587612460983, + "grad_norm": 0.21562239527702332, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.0051, + "step": 15590 + }, + { + "epoch": 0.9547707938062305, + "grad_norm": 0.19122859835624695, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0046, + "step": 15600 + }, + { + "epoch": 0.9553828263663627, + "grad_norm": 0.24596959352493286, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.005, + "step": 15610 + }, + { + "epoch": 0.9559948589264949, + "grad_norm": 0.182195246219635, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0038, + "step": 15620 + }, + { + "epoch": 0.9566068914866271, + "grad_norm": 0.3122585415840149, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0053, + "step": 15630 + }, + { + "epoch": 0.9572189240467592, + "grad_norm": 0.25725093483924866, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0063, + "step": 15640 + }, + { + "epoch": 0.9578309566068914, + "grad_norm": 0.19965514540672302, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0053, + "step": 15650 + }, + { + "epoch": 0.9584429891670236, + "grad_norm": 0.3474758267402649, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.006, + "step": 15660 + }, + { + "epoch": 0.9590550217271558, + "grad_norm": 0.18151336908340454, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0048, + "step": 15670 + }, + { + "epoch": 0.959667054287288, + "grad_norm": 0.18923020362854004, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0037, + "step": 15680 + }, + { + "epoch": 0.9602790868474202, + "grad_norm": 0.19792871177196503, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0049, + "step": 15690 + }, + { + "epoch": 0.9608911194075525, + "grad_norm": 0.20296797156333923, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0047, + "step": 15700 + }, + { + "epoch": 0.9615031519676847, + "grad_norm": 0.2556051015853882, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.9621151845278169, + "grad_norm": 0.35538288950920105, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0037, + "step": 15720 + }, + { + "epoch": 0.9627272170879491, + "grad_norm": 0.45357266068458557, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0065, + "step": 15730 + }, + { + "epoch": 0.9633392496480813, + "grad_norm": 0.23721693456172943, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0046, + "step": 15740 + }, + { + "epoch": 0.9639512822082135, + "grad_norm": 0.2727845013141632, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0052, + "step": 15750 + }, + { + "epoch": 0.9645633147683457, + "grad_norm": 0.2647950351238251, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0054, + "step": 15760 + }, + { + "epoch": 0.9651753473284779, + "grad_norm": 0.23364882171154022, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.005, + "step": 15770 + }, + { + "epoch": 0.9657873798886101, + "grad_norm": 0.2035825401544571, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0054, + "step": 15780 + }, + { + "epoch": 0.9663994124487423, + "grad_norm": 0.2411692589521408, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0062, + "step": 15790 + }, + { + "epoch": 0.9670114450088745, + "grad_norm": 0.23559266328811646, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 0.9676234775690067, + "grad_norm": 0.23872418701648712, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0063, + "step": 15810 + }, + { + "epoch": 0.9682355101291389, + "grad_norm": 0.27072128653526306, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0052, + "step": 15820 + }, + { + "epoch": 0.9688475426892711, + "grad_norm": 0.42610588669776917, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0056, + "step": 15830 + }, + { + "epoch": 0.9694595752494033, + "grad_norm": 0.13065233826637268, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0044, + "step": 15840 + }, + { + "epoch": 0.9700716078095355, + "grad_norm": 0.2479996383190155, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0049, + "step": 15850 + }, + { + "epoch": 0.9706836403696677, + "grad_norm": 0.22867974638938904, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 0.9712956729297999, + "grad_norm": 0.21570387482643127, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0048, + "step": 15870 + }, + { + "epoch": 0.9719077054899321, + "grad_norm": 0.26354169845581055, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0073, + "step": 15880 + }, + { + "epoch": 0.9725197380500643, + "grad_norm": 0.19785451889038086, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0044, + "step": 15890 + }, + { + "epoch": 0.9731317706101965, + "grad_norm": 0.09346124529838562, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0051, + "step": 15900 + }, + { + "epoch": 0.9737438031703287, + "grad_norm": 0.18946298956871033, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0049, + "step": 15910 + }, + { + "epoch": 0.9743558357304608, + "grad_norm": 0.1761726588010788, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0057, + "step": 15920 + }, + { + "epoch": 0.974967868290593, + "grad_norm": 0.2610328495502472, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0061, + "step": 15930 + }, + { + "epoch": 0.9755799008507252, + "grad_norm": 0.1841743141412735, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 0.9761919334108574, + "grad_norm": 0.14279355108737946, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 0.9768039659709896, + "grad_norm": 0.1717681884765625, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0035, + "step": 15960 + }, + { + "epoch": 0.9774159985311218, + "grad_norm": 0.2102527618408203, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.007, + "step": 15970 + }, + { + "epoch": 0.978028031091254, + "grad_norm": 0.29462379217147827, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0058, + "step": 15980 + }, + { + "epoch": 0.9786400636513862, + "grad_norm": 0.1863207072019577, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0058, + "step": 15990 + }, + { + "epoch": 0.9792520962115184, + "grad_norm": 0.2764773964881897, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0051, + "step": 16000 + }, + { + "epoch": 0.9798641287716506, + "grad_norm": 0.2723250091075897, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0056, + "step": 16010 + }, + { + "epoch": 0.9804761613317828, + "grad_norm": 0.21564331650733948, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0048, + "step": 16020 + }, + { + "epoch": 0.981088193891915, + "grad_norm": 0.20242232084274292, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0058, + "step": 16030 + }, + { + "epoch": 0.9817002264520472, + "grad_norm": 0.21522754430770874, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0039, + "step": 16040 + }, + { + "epoch": 0.9823122590121794, + "grad_norm": 0.20013833045959473, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0051, + "step": 16050 + }, + { + "epoch": 0.9829242915723116, + "grad_norm": 0.3008810579776764, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 0.9835363241324439, + "grad_norm": 0.2994979918003082, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0033, + "step": 16070 + }, + { + "epoch": 0.984148356692576, + "grad_norm": 0.22704628109931946, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0046, + "step": 16080 + }, + { + "epoch": 0.9847603892527083, + "grad_norm": 0.3253551423549652, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9853724218128405, + "grad_norm": 0.14902091026306152, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0042, + "step": 16100 + }, + { + "epoch": 0.9859844543729727, + "grad_norm": 0.15155524015426636, + "learning_rate": 1.04066696184376e-05, + "loss": 0.005, + "step": 16110 + }, + { + "epoch": 0.9865964869331049, + "grad_norm": 0.1859518140554428, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0063, + "step": 16120 + }, + { + "epoch": 0.9872085194932371, + "grad_norm": 0.5434902906417847, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0072, + "step": 16130 + }, + { + "epoch": 0.9878205520533693, + "grad_norm": 0.19308103621006012, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0046, + "step": 16140 + }, + { + "epoch": 0.9884325846135015, + "grad_norm": 0.21260593831539154, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0077, + "step": 16150 + }, + { + "epoch": 0.9890446171736337, + "grad_norm": 0.15255668759346008, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0059, + "step": 16160 + }, + { + "epoch": 0.9896566497337659, + "grad_norm": 0.18739885091781616, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0047, + "step": 16170 + }, + { + "epoch": 0.9902686822938981, + "grad_norm": 0.2112029641866684, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0049, + "step": 16180 + }, + { + "epoch": 0.9908807148540303, + "grad_norm": 0.35941991209983826, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.005, + "step": 16190 + }, + { + "epoch": 0.9914927474141624, + "grad_norm": 0.16792108118534088, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0051, + "step": 16200 + }, + { + "epoch": 0.9921047799742946, + "grad_norm": 0.1985466182231903, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0063, + "step": 16210 + }, + { + "epoch": 0.9927168125344268, + "grad_norm": 0.17579570412635803, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0046, + "step": 16220 + }, + { + "epoch": 0.993328845094559, + "grad_norm": 0.23352178931236267, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0061, + "step": 16230 + }, + { + "epoch": 0.9939408776546912, + "grad_norm": 0.3543553054332733, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0054, + "step": 16240 + }, + { + "epoch": 0.9945529102148234, + "grad_norm": 0.18603719770908356, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0049, + "step": 16250 + }, + { + "epoch": 0.9951649427749556, + "grad_norm": 0.31745344400405884, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0061, + "step": 16260 + }, + { + "epoch": 0.9957769753350878, + "grad_norm": 0.1416773498058319, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0063, + "step": 16270 + }, + { + "epoch": 0.99638900789522, + "grad_norm": 0.18451642990112305, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0055, + "step": 16280 + }, + { + "epoch": 0.9970010404553522, + "grad_norm": 0.13422183692455292, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0047, + "step": 16290 + }, + { + "epoch": 0.9976130730154844, + "grad_norm": 0.15831588208675385, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0045, + "step": 16300 + }, + { + "epoch": 0.9982251055756166, + "grad_norm": 0.42520084977149963, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0053, + "step": 16310 + }, + { + "epoch": 0.9988371381357488, + "grad_norm": 0.20889437198638916, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0043, + "step": 16320 + }, + { + "epoch": 0.999449170695881, + "grad_norm": 0.17016667127609253, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0072, + "step": 16330 + }, + { + "epoch": 1.0000612032560132, + "grad_norm": 0.3129214346408844, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0054, + "step": 16340 + }, + { + "epoch": 1.0006732358161454, + "grad_norm": 0.334224134683609, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0037, + "step": 16350 + }, + { + "epoch": 1.0012852683762776, + "grad_norm": 0.28502705693244934, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0065, + "step": 16360 + }, + { + "epoch": 1.0018973009364098, + "grad_norm": 0.21431966125965118, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0046, + "step": 16370 + }, + { + "epoch": 1.002509333496542, + "grad_norm": 0.22898051142692566, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 1.0031213660566742, + "grad_norm": 0.41625624895095825, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0068, + "step": 16390 + }, + { + "epoch": 1.0037333986168064, + "grad_norm": 0.2510327398777008, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 1.0043454311769386, + "grad_norm": 0.23560962080955505, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0049, + "step": 16410 + }, + { + "epoch": 1.0049574637370708, + "grad_norm": 0.2081199437379837, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0061, + "step": 16420 + }, + { + "epoch": 1.005569496297203, + "grad_norm": 0.12456244230270386, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0057, + "step": 16430 + }, + { + "epoch": 1.0061815288573353, + "grad_norm": 0.22212636470794678, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0052, + "step": 16440 + }, + { + "epoch": 1.0067935614174675, + "grad_norm": 0.27772897481918335, + "learning_rate": 1.007637577910799e-05, + "loss": 0.007, + "step": 16450 + }, + { + "epoch": 1.0074055939775997, + "grad_norm": 0.40040507912635803, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0051, + "step": 16460 + }, + { + "epoch": 1.0080176265377319, + "grad_norm": 0.19763565063476562, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0047, + "step": 16470 + }, + { + "epoch": 1.008629659097864, + "grad_norm": 0.2906181514263153, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0055, + "step": 16480 + }, + { + "epoch": 1.0092416916579963, + "grad_norm": 0.29949888586997986, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0045, + "step": 16490 + }, + { + "epoch": 1.0098537242181285, + "grad_norm": 0.3900962769985199, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0053, + "step": 16500 + }, + { + "epoch": 1.0104657567782607, + "grad_norm": 0.22380846738815308, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0043, + "step": 16510 + }, + { + "epoch": 1.0110777893383929, + "grad_norm": 0.3426673412322998, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0052, + "step": 16520 + }, + { + "epoch": 1.011689821898525, + "grad_norm": 0.2452230006456375, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0055, + "step": 16530 + }, + { + "epoch": 1.0123018544586573, + "grad_norm": 0.24280408024787903, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0042, + "step": 16540 + }, + { + "epoch": 1.0129138870187895, + "grad_norm": 0.18271701037883759, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0047, + "step": 16550 + }, + { + "epoch": 1.0135259195789217, + "grad_norm": 0.2874322235584259, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0059, + "step": 16560 + }, + { + "epoch": 1.0141379521390539, + "grad_norm": 0.17367394268512726, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0048, + "step": 16570 + }, + { + "epoch": 1.014749984699186, + "grad_norm": 0.167460098862648, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0074, + "step": 16580 + }, + { + "epoch": 1.0153620172593183, + "grad_norm": 0.21867765486240387, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 1.0159740498194505, + "grad_norm": 0.2539086639881134, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0057, + "step": 16600 + }, + { + "epoch": 1.0165860823795827, + "grad_norm": 0.1415795534849167, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 1.0171981149397147, + "grad_norm": 0.12702493369579315, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0038, + "step": 16620 + }, + { + "epoch": 1.0178101474998469, + "grad_norm": 0.16548305749893188, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0042, + "step": 16630 + }, + { + "epoch": 1.018422180059979, + "grad_norm": 0.4413173496723175, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0059, + "step": 16640 + }, + { + "epoch": 1.0190342126201113, + "grad_norm": 0.30871614813804626, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0045, + "step": 16650 + }, + { + "epoch": 1.0196462451802435, + "grad_norm": 0.259650319814682, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0044, + "step": 16660 + }, + { + "epoch": 1.0202582777403757, + "grad_norm": 0.36035388708114624, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0068, + "step": 16670 + }, + { + "epoch": 1.020870310300508, + "grad_norm": 0.3487808406352997, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0038, + "step": 16680 + }, + { + "epoch": 1.02148234286064, + "grad_norm": 0.2898370623588562, + "learning_rate": 9.843955128197274e-06, + "loss": 0.004, + "step": 16690 + }, + { + "epoch": 1.0220943754207723, + "grad_norm": 0.2942182719707489, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0042, + "step": 16700 + }, + { + "epoch": 1.0227064079809045, + "grad_norm": 0.27839869260787964, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0042, + "step": 16710 + }, + { + "epoch": 1.0233184405410367, + "grad_norm": 0.17199957370758057, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0059, + "step": 16720 + }, + { + "epoch": 1.023930473101169, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0073, + "step": 16730 + }, + { + "epoch": 1.0245425056613011, + "grad_norm": 0.19908513128757477, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0047, + "step": 16740 + }, + { + "epoch": 1.0251545382214333, + "grad_norm": 0.23300328850746155, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0055, + "step": 16750 + }, + { + "epoch": 1.0257665707815655, + "grad_norm": 0.24671277403831482, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0043, + "step": 16760 + }, + { + "epoch": 1.0263786033416977, + "grad_norm": 0.23183101415634155, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0052, + "step": 16770 + }, + { + "epoch": 1.02699063590183, + "grad_norm": 0.13460612297058105, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0035, + "step": 16780 + }, + { + "epoch": 1.0276026684619621, + "grad_norm": 0.1990940123796463, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0044, + "step": 16790 + }, + { + "epoch": 1.0282147010220943, + "grad_norm": 0.21223406493663788, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 1.0288267335822265, + "grad_norm": 0.2649106979370117, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0044, + "step": 16810 + }, + { + "epoch": 1.0294387661423587, + "grad_norm": 0.2524845600128174, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0048, + "step": 16820 + }, + { + "epoch": 1.030050798702491, + "grad_norm": 0.22169779241085052, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 1.0306628312626231, + "grad_norm": 0.16642418503761292, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0048, + "step": 16840 + }, + { + "epoch": 1.0312748638227553, + "grad_norm": 0.22939598560333252, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0048, + "step": 16850 + }, + { + "epoch": 1.0318868963828876, + "grad_norm": 0.2131129503250122, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0055, + "step": 16860 + }, + { + "epoch": 1.0324989289430198, + "grad_norm": 0.20492705702781677, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0041, + "step": 16870 + }, + { + "epoch": 1.033110961503152, + "grad_norm": 0.2988845705986023, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0042, + "step": 16880 + }, + { + "epoch": 1.0337229940632842, + "grad_norm": 0.18579600751399994, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0042, + "step": 16890 + }, + { + "epoch": 1.0343350266234164, + "grad_norm": 0.2553490698337555, + "learning_rate": 9.641222698101725e-06, + "loss": 0.005, + "step": 16900 + }, + { + "epoch": 1.0349470591835486, + "grad_norm": 0.338440865278244, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0036, + "step": 16910 + }, + { + "epoch": 1.0355590917436808, + "grad_norm": 0.12755723297595978, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0044, + "step": 16920 + }, + { + "epoch": 1.036171124303813, + "grad_norm": 0.12222232669591904, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 1.0367831568639452, + "grad_norm": 0.20246204733848572, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0055, + "step": 16940 + }, + { + "epoch": 1.0373951894240774, + "grad_norm": 0.36903291940689087, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0051, + "step": 16950 + }, + { + "epoch": 1.0380072219842096, + "grad_norm": 0.3166116178035736, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0045, + "step": 16960 + }, + { + "epoch": 1.0386192545443418, + "grad_norm": 0.2777375280857086, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0041, + "step": 16970 + }, + { + "epoch": 1.039231287104474, + "grad_norm": 0.3173989951610565, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0053, + "step": 16980 + }, + { + "epoch": 1.0398433196646062, + "grad_norm": 0.2135571539402008, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0051, + "step": 16990 + }, + { + "epoch": 1.0404553522247384, + "grad_norm": 0.18536782264709473, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0037, + "step": 17000 + }, + { + "epoch": 1.0410673847848706, + "grad_norm": 0.17782410979270935, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0052, + "step": 17010 + }, + { + "epoch": 1.0416794173450028, + "grad_norm": 0.31509512662887573, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0099, + "step": 17020 + }, + { + "epoch": 1.042291449905135, + "grad_norm": 0.22748225927352905, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 1.0429034824652672, + "grad_norm": 0.14924705028533936, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 1.0435155150253994, + "grad_norm": 0.21390999853610992, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0044, + "step": 17050 + }, + { + "epoch": 1.0441275475855316, + "grad_norm": 0.25828516483306885, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0042, + "step": 17060 + }, + { + "epoch": 1.0447395801456638, + "grad_norm": 0.24069662392139435, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0069, + "step": 17070 + }, + { + "epoch": 1.045351612705796, + "grad_norm": 0.1090504601597786, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0036, + "step": 17080 + }, + { + "epoch": 1.0459636452659282, + "grad_norm": 0.17990687489509583, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0049, + "step": 17090 + }, + { + "epoch": 1.0465756778260604, + "grad_norm": 0.21505555510520935, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0051, + "step": 17100 + }, + { + "epoch": 1.0471877103861926, + "grad_norm": 0.2157493680715561, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0063, + "step": 17110 + }, + { + "epoch": 1.0477997429463248, + "grad_norm": 0.30865493416786194, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0053, + "step": 17120 + }, + { + "epoch": 1.048411775506457, + "grad_norm": 0.16882938146591187, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0043, + "step": 17130 + }, + { + "epoch": 1.0490238080665892, + "grad_norm": 0.14921846985816956, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0496358406267214, + "grad_norm": 0.15723800659179688, + "learning_rate": 9.400800085133245e-06, + "loss": 0.005, + "step": 17150 + }, + { + "epoch": 1.0502478731868536, + "grad_norm": 0.19597285985946655, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0046, + "step": 17160 + }, + { + "epoch": 1.0508599057469858, + "grad_norm": 0.1684723198413849, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0053, + "step": 17170 + }, + { + "epoch": 1.051471938307118, + "grad_norm": 0.1733175367116928, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0053, + "step": 17180 + }, + { + "epoch": 1.0520839708672503, + "grad_norm": 0.23111647367477417, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0048, + "step": 17190 + }, + { + "epoch": 1.0526960034273822, + "grad_norm": 0.36174628138542175, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0049, + "step": 17200 + }, + { + "epoch": 1.0533080359875144, + "grad_norm": 0.15791575610637665, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0048, + "step": 17210 + }, + { + "epoch": 1.0539200685476466, + "grad_norm": 0.16026809811592102, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0047, + "step": 17220 + }, + { + "epoch": 1.0545321011077788, + "grad_norm": 0.13964296877384186, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0033, + "step": 17230 + }, + { + "epoch": 1.055144133667911, + "grad_norm": 0.22623896598815918, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0041, + "step": 17240 + }, + { + "epoch": 1.0557561662280432, + "grad_norm": 0.15534555912017822, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 1.0563681987881754, + "grad_norm": 0.09519665688276291, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0035, + "step": 17260 + }, + { + "epoch": 1.0569802313483077, + "grad_norm": 0.19323785603046417, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0045, + "step": 17270 + }, + { + "epoch": 1.0575922639084399, + "grad_norm": 0.21194952726364136, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0047, + "step": 17280 + }, + { + "epoch": 1.058204296468572, + "grad_norm": 0.28977999091148376, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0049, + "step": 17290 + }, + { + "epoch": 1.0588163290287043, + "grad_norm": 0.1739121824502945, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0062, + "step": 17300 + }, + { + "epoch": 1.0594283615888365, + "grad_norm": 0.23189865052700043, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0055, + "step": 17310 + }, + { + "epoch": 1.0600403941489687, + "grad_norm": 0.15705449879169464, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 1.0606524267091009, + "grad_norm": 0.23189882934093475, + "learning_rate": 9.228411903689187e-06, + "loss": 0.003, + "step": 17330 + }, + { + "epoch": 1.061264459269233, + "grad_norm": 0.19559095799922943, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0051, + "step": 17340 + }, + { + "epoch": 1.0618764918293653, + "grad_norm": 0.2560543715953827, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0624885243894975, + "grad_norm": 0.35167232155799866, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0042, + "step": 17360 + }, + { + "epoch": 1.0631005569496297, + "grad_norm": 0.17626497149467468, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0039, + "step": 17370 + }, + { + "epoch": 1.0637125895097619, + "grad_norm": 0.18818546831607819, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0043, + "step": 17380 + }, + { + "epoch": 1.064324622069894, + "grad_norm": 0.10237561911344528, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0053, + "step": 17390 + }, + { + "epoch": 1.0649366546300263, + "grad_norm": 0.21828459203243256, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0042, + "step": 17400 + }, + { + "epoch": 1.0655486871901585, + "grad_norm": 0.09354235231876373, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 1.0661607197502907, + "grad_norm": 0.18106088042259216, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0051, + "step": 17420 + }, + { + "epoch": 1.066772752310423, + "grad_norm": 0.21538101136684418, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0056, + "step": 17430 + }, + { + "epoch": 1.067384784870555, + "grad_norm": 0.18729519844055176, + "learning_rate": 9.1233909973763e-06, + "loss": 0.004, + "step": 17440 + }, + { + "epoch": 1.0679968174306873, + "grad_norm": 0.3791484832763672, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0052, + "step": 17450 + }, + { + "epoch": 1.0686088499908195, + "grad_norm": 0.19206254184246063, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0042, + "step": 17460 + }, + { + "epoch": 1.0692208825509517, + "grad_norm": 0.15434518456459045, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0061, + "step": 17470 + }, + { + "epoch": 1.069832915111084, + "grad_norm": 0.17898093163967133, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 1.0704449476712161, + "grad_norm": 0.21975649893283844, + "learning_rate": 9.07574141798717e-06, + "loss": 0.005, + "step": 17490 + }, + { + "epoch": 1.0710569802313483, + "grad_norm": 0.1380346417427063, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0032, + "step": 17500 + }, + { + "epoch": 1.0716690127914805, + "grad_norm": 0.28567400574684143, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0044, + "step": 17510 + }, + { + "epoch": 1.0722810453516127, + "grad_norm": 0.22925534844398499, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 1.072893077911745, + "grad_norm": 0.27094215154647827, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0047, + "step": 17530 + }, + { + "epoch": 1.0735051104718771, + "grad_norm": 0.32299691438674927, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0048, + "step": 17540 + }, + { + "epoch": 1.0741171430320093, + "grad_norm": 0.26789531111717224, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 1.0747291755921415, + "grad_norm": 0.3175952434539795, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0077, + "step": 17560 + }, + { + "epoch": 1.0753412081522737, + "grad_norm": 0.24784249067306519, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 1.075953240712406, + "grad_norm": 0.3081960380077362, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0046, + "step": 17580 + }, + { + "epoch": 1.0765652732725381, + "grad_norm": 0.25334152579307556, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0039, + "step": 17590 + }, + { + "epoch": 1.0771773058326704, + "grad_norm": 0.24747619032859802, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0777893383928026, + "grad_norm": 0.19048908352851868, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0049, + "step": 17610 + }, + { + "epoch": 1.0784013709529348, + "grad_norm": 0.18883349001407623, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "epoch": 1.079013403513067, + "grad_norm": 0.18653099238872528, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0044, + "step": 17630 + }, + { + "epoch": 1.0796254360731992, + "grad_norm": 0.1320251226425171, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 1.0802374686333314, + "grad_norm": 0.14996238052845, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0041, + "step": 17650 + }, + { + "epoch": 1.0808495011934636, + "grad_norm": 0.4576573073863983, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0059, + "step": 17660 + }, + { + "epoch": 1.0814615337535958, + "grad_norm": 0.19582511484622955, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0051, + "step": 17670 + }, + { + "epoch": 1.082073566313728, + "grad_norm": 0.21973003447055817, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0059, + "step": 17680 + }, + { + "epoch": 1.0826855988738602, + "grad_norm": 0.18183568120002747, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0056, + "step": 17690 + }, + { + "epoch": 1.0832976314339924, + "grad_norm": 0.1761978417634964, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0049, + "step": 17700 + }, + { + "epoch": 1.0839096639941246, + "grad_norm": 0.10185366123914719, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0041, + "step": 17710 + }, + { + "epoch": 1.0845216965542568, + "grad_norm": 0.262513130903244, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0046, + "step": 17720 + }, + { + "epoch": 1.0851337291143888, + "grad_norm": 0.36413198709487915, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0043, + "step": 17730 + }, + { + "epoch": 1.085745761674521, + "grad_norm": 0.2258218675851822, + "learning_rate": 8.83836825410936e-06, + "loss": 0.005, + "step": 17740 + }, + { + "epoch": 1.0863577942346532, + "grad_norm": 0.20840497314929962, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0067, + "step": 17750 + }, + { + "epoch": 1.0869698267947854, + "grad_norm": 0.33392995595932007, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.0875818593549176, + "grad_norm": 0.18477876484394073, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0039, + "step": 17770 + }, + { + "epoch": 1.0881938919150498, + "grad_norm": 0.14785899221897125, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0063, + "step": 17780 + }, + { + "epoch": 1.088805924475182, + "grad_norm": 0.12930043041706085, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0055, + "step": 17790 + }, + { + "epoch": 1.0894179570353142, + "grad_norm": 0.1541786789894104, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0035, + "step": 17800 + }, + { + "epoch": 1.0900299895954464, + "grad_norm": 0.1781499683856964, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0054, + "step": 17810 + }, + { + "epoch": 1.0906420221555786, + "grad_norm": 0.13659314811229706, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0912540547157108, + "grad_norm": 0.18936918675899506, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0041, + "step": 17830 + }, + { + "epoch": 1.091866087275843, + "grad_norm": 0.24795638024806976, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0043, + "step": 17840 + }, + { + "epoch": 1.0924781198359752, + "grad_norm": 0.28090324997901917, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 1.0930901523961074, + "grad_norm": 0.3130576014518738, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0042, + "step": 17860 + }, + { + "epoch": 1.0937021849562396, + "grad_norm": 0.19758646190166473, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0047, + "step": 17870 + }, + { + "epoch": 1.0943142175163718, + "grad_norm": 0.20309071242809296, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0042, + "step": 17880 + }, + { + "epoch": 1.094926250076504, + "grad_norm": 0.19741898775100708, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0057, + "step": 17890 + }, + { + "epoch": 1.0955382826366362, + "grad_norm": 0.19182747602462769, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 1.0961503151967684, + "grad_norm": 0.14508575201034546, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.0967623477569006, + "grad_norm": 0.19854849576950073, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0064, + "step": 17920 + }, + { + "epoch": 1.0973743803170328, + "grad_norm": 0.15055720508098602, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0049, + "step": 17930 + }, + { + "epoch": 1.097986412877165, + "grad_norm": 0.1855372190475464, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0043, + "step": 17940 + }, + { + "epoch": 1.0985984454372972, + "grad_norm": 0.13770940899848938, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0058, + "step": 17950 + }, + { + "epoch": 1.0992104779974294, + "grad_norm": 0.24905221164226532, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0048, + "step": 17960 + }, + { + "epoch": 1.0998225105575616, + "grad_norm": 0.1951165348291397, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0043, + "step": 17970 + }, + { + "epoch": 1.1004345431176938, + "grad_norm": 0.18365852534770966, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 1.101046575677826, + "grad_norm": 0.16304127871990204, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0034, + "step": 17990 + }, + { + "epoch": 1.1016586082379582, + "grad_norm": 0.262677401304245, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 1.1022706407980905, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.583791146965244e-06, + "loss": 0.007, + "step": 18010 + }, + { + "epoch": 1.1028826733582227, + "grad_norm": 0.2832951247692108, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0056, + "step": 18020 + }, + { + "epoch": 1.1034947059183549, + "grad_norm": 0.1781810224056244, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 1.104106738478487, + "grad_norm": 0.23228950798511505, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0045, + "step": 18040 + }, + { + "epoch": 1.1047187710386193, + "grad_norm": 0.2573170065879822, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 1.1053308035987515, + "grad_norm": 0.30996036529541016, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0054, + "step": 18060 + }, + { + "epoch": 1.1059428361588837, + "grad_norm": 0.24979132413864136, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0045, + "step": 18070 + }, + { + "epoch": 1.1065548687190159, + "grad_norm": 0.17564314603805542, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0033, + "step": 18080 + }, + { + "epoch": 1.107166901279148, + "grad_norm": 0.14539776742458344, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0047, + "step": 18090 + }, + { + "epoch": 1.1077789338392803, + "grad_norm": 0.2530387341976166, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0058, + "step": 18100 + }, + { + "epoch": 1.1083909663994125, + "grad_norm": 0.2038760781288147, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0052, + "step": 18110 + }, + { + "epoch": 1.1090029989595447, + "grad_norm": 0.1769075244665146, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0043, + "step": 18120 + }, + { + "epoch": 1.1096150315196769, + "grad_norm": 0.1686626374721527, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0055, + "step": 18130 + }, + { + "epoch": 1.110227064079809, + "grad_norm": 0.21752336621284485, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0052, + "step": 18140 + }, + { + "epoch": 1.1108390966399413, + "grad_norm": 0.2739295959472656, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0039, + "step": 18150 + }, + { + "epoch": 1.1114511292000735, + "grad_norm": 0.18259567022323608, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0038, + "step": 18160 + }, + { + "epoch": 1.1120631617602057, + "grad_norm": 0.21565310657024384, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0043, + "step": 18170 + }, + { + "epoch": 1.112675194320338, + "grad_norm": 0.2141607403755188, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0059, + "step": 18180 + }, + { + "epoch": 1.11328722688047, + "grad_norm": 0.3017563819885254, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0044, + "step": 18190 + }, + { + "epoch": 1.1138992594406023, + "grad_norm": 0.2021455019712448, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0044, + "step": 18200 + }, + { + "epoch": 1.1145112920007345, + "grad_norm": 0.2113070785999298, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0048, + "step": 18210 + }, + { + "epoch": 1.1151233245608667, + "grad_norm": 0.18945784866809845, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0029, + "step": 18220 + }, + { + "epoch": 1.115735357120999, + "grad_norm": 0.15259192883968353, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0043, + "step": 18230 + }, + { + "epoch": 1.1163473896811311, + "grad_norm": 0.17555822432041168, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0039, + "step": 18240 + }, + { + "epoch": 1.1169594222412633, + "grad_norm": 0.20105648040771484, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0058, + "step": 18250 + }, + { + "epoch": 1.1175714548013955, + "grad_norm": 0.31626567244529724, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0044, + "step": 18260 + }, + { + "epoch": 1.1181834873615277, + "grad_norm": 0.16219007968902588, + "learning_rate": 8.340593854157868e-06, + "loss": 0.005, + "step": 18270 + }, + { + "epoch": 1.11879551992166, + "grad_norm": 0.2174186110496521, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0055, + "step": 18280 + }, + { + "epoch": 1.1194075524817921, + "grad_norm": 0.13639339804649353, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0043, + "step": 18290 + }, + { + "epoch": 1.1200195850419243, + "grad_norm": 0.15100249648094177, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0042, + "step": 18300 + }, + { + "epoch": 1.1206316176020565, + "grad_norm": 0.2114904671907425, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0043, + "step": 18310 + }, + { + "epoch": 1.1212436501621887, + "grad_norm": 0.2941966950893402, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0052, + "step": 18320 + }, + { + "epoch": 1.1218556827223207, + "grad_norm": 0.21695150434970856, + "learning_rate": 8.28476400245882e-06, + "loss": 0.005, + "step": 18330 + }, + { + "epoch": 1.122467715282453, + "grad_norm": 0.11768218129873276, + "learning_rate": 8.275470116190976e-06, + "loss": 0.005, + "step": 18340 + }, + { + "epoch": 1.1230797478425851, + "grad_norm": 0.1427483856678009, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0039, + "step": 18350 + }, + { + "epoch": 1.1236917804027173, + "grad_norm": 0.1837971955537796, + "learning_rate": 8.256891946721157e-06, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 1.1243038129628495, + "grad_norm": 0.30968883633613586, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0037, + "step": 18370 + }, + { + "epoch": 1.1249158455229817, + "grad_norm": 0.13366396725177765, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0042, + "step": 18380 + }, + { + "epoch": 1.125527878083114, + "grad_norm": 0.1829235553741455, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0039, + "step": 18390 + }, + { + "epoch": 1.1261399106432461, + "grad_norm": 0.3106991648674011, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0052, + "step": 18400 + }, + { + "epoch": 1.1267519432033783, + "grad_norm": 0.38655754923820496, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0046, + "step": 18410 + }, + { + "epoch": 1.1273639757635106, + "grad_norm": 0.23598383367061615, + "learning_rate": 8.201235047388747e-06, + "loss": 0.004, + "step": 18420 + }, + { + "epoch": 1.1279760083236428, + "grad_norm": 0.17428012192249298, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0046, + "step": 18430 + }, + { + "epoch": 1.128588040883775, + "grad_norm": 0.1847466081380844, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0043, + "step": 18440 + }, + { + "epoch": 1.1292000734439072, + "grad_norm": 0.14917762577533722, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0049, + "step": 18450 + }, + { + "epoch": 1.1298121060040394, + "grad_norm": 0.2882528305053711, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 1.1304241385641716, + "grad_norm": 0.36186549067497253, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0065, + "step": 18470 + }, + { + "epoch": 1.1310361711243038, + "grad_norm": 0.1604463905096054, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0037, + "step": 18480 + }, + { + "epoch": 1.131648203684436, + "grad_norm": 0.17751921713352203, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0034, + "step": 18490 + }, + { + "epoch": 1.1322602362445682, + "grad_norm": 0.15355733036994934, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0059, + "step": 18500 + }, + { + "epoch": 1.1328722688047004, + "grad_norm": 0.21558596193790436, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0044, + "step": 18510 + }, + { + "epoch": 1.1334843013648326, + "grad_norm": 0.20114412903785706, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1340963339249648, + "grad_norm": 0.17260855436325073, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0053, + "step": 18530 + }, + { + "epoch": 1.134708366485097, + "grad_norm": 0.16089287400245667, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0032, + "step": 18540 + }, + { + "epoch": 1.1353203990452292, + "grad_norm": 0.14655937254428864, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 1.1359324316053614, + "grad_norm": 0.16373249888420105, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0058, + "step": 18560 + }, + { + "epoch": 1.1365444641654936, + "grad_norm": 0.14543801546096802, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1371564967256258, + "grad_norm": 0.3515278100967407, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0043, + "step": 18580 + }, + { + "epoch": 1.137768529285758, + "grad_norm": 0.21776945888996124, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0053, + "step": 18590 + }, + { + "epoch": 1.1383805618458902, + "grad_norm": 0.21879829466342926, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0051, + "step": 18600 + }, + { + "epoch": 1.1389925944060224, + "grad_norm": 0.16967973113059998, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0048, + "step": 18610 + }, + { + "epoch": 1.1396046269661546, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0056, + "step": 18620 + }, + { + "epoch": 1.1402166595262868, + "grad_norm": 0.1858961284160614, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0067, + "step": 18630 + }, + { + "epoch": 1.140828692086419, + "grad_norm": 0.25853803753852844, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0057, + "step": 18640 + }, + { + "epoch": 1.1414407246465512, + "grad_norm": 0.18566234409809113, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0048, + "step": 18650 + }, + { + "epoch": 1.1420527572066834, + "grad_norm": 0.3471083343029022, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0042, + "step": 18660 + }, + { + "epoch": 1.1426647897668156, + "grad_norm": 0.2092636376619339, + "learning_rate": 7.970630670012853e-06, + "loss": 0.004, + "step": 18670 + }, + { + "epoch": 1.1432768223269478, + "grad_norm": 0.3432580828666687, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 1.14388885488708, + "grad_norm": 0.14227882027626038, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0041, + "step": 18690 + }, + { + "epoch": 1.1445008874472122, + "grad_norm": 0.2128007709980011, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0037, + "step": 18700 + }, + { + "epoch": 1.1451129200073444, + "grad_norm": 0.25377482175827026, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0049, + "step": 18710 + }, + { + "epoch": 1.1457249525674766, + "grad_norm": 0.1905982494354248, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0037, + "step": 18720 + }, + { + "epoch": 1.1463369851276088, + "grad_norm": 0.3090096712112427, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0079, + "step": 18730 + }, + { + "epoch": 1.146949017687741, + "grad_norm": 0.15604345500469208, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0037, + "step": 18740 + }, + { + "epoch": 1.1475610502478732, + "grad_norm": 0.21756386756896973, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0043, + "step": 18750 + }, + { + "epoch": 1.1481730828080055, + "grad_norm": 0.23869304358959198, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0038, + "step": 18760 + }, + { + "epoch": 1.1487851153681377, + "grad_norm": 0.18082380294799805, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 1.1493971479282699, + "grad_norm": 0.4032754898071289, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0061, + "step": 18780 + }, + { + "epoch": 1.150009180488402, + "grad_norm": 0.3173290491104126, + "learning_rate": 7.860719408056385e-06, + "loss": 0.004, + "step": 18790 + }, + { + "epoch": 1.1506212130485343, + "grad_norm": 0.18892645835876465, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.1512332456086665, + "grad_norm": 0.26740241050720215, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0056, + "step": 18810 + }, + { + "epoch": 1.1518452781687987, + "grad_norm": 0.3046218752861023, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0072, + "step": 18820 + }, + { + "epoch": 1.1524573107289309, + "grad_norm": 0.17181983590126038, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0034, + "step": 18830 + }, + { + "epoch": 1.1530693432890629, + "grad_norm": 0.22095724940299988, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0045, + "step": 18840 + }, + { + "epoch": 1.153681375849195, + "grad_norm": 0.1514609307050705, + "learning_rate": 7.80596155940873e-06, + "loss": 0.004, + "step": 18850 + }, + { + "epoch": 1.1542934084093273, + "grad_norm": 0.15244366228580475, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0047, + "step": 18860 + }, + { + "epoch": 1.1549054409694595, + "grad_norm": 0.24359947443008423, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0039, + "step": 18870 + }, + { + "epoch": 1.1555174735295917, + "grad_norm": 0.15558156371116638, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0036, + "step": 18880 + }, + { + "epoch": 1.1561295060897239, + "grad_norm": 0.33679234981536865, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0041, + "step": 18890 + }, + { + "epoch": 1.156741538649856, + "grad_norm": 0.15811999142169952, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0062, + "step": 18900 + }, + { + "epoch": 1.1573535712099883, + "grad_norm": 0.14838527143001556, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0029, + "step": 18910 + }, + { + "epoch": 1.1579656037701205, + "grad_norm": 0.23024815320968628, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0038, + "step": 18920 + }, + { + "epoch": 1.1585776363302527, + "grad_norm": 0.18455618619918823, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0044, + "step": 18930 + }, + { + "epoch": 1.1591896688903849, + "grad_norm": 0.20213079452514648, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.159801701450517, + "grad_norm": 0.19000643491744995, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0043, + "step": 18950 + }, + { + "epoch": 1.1604137340106493, + "grad_norm": 0.14075686037540436, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0047, + "step": 18960 + }, + { + "epoch": 1.1610257665707815, + "grad_norm": 0.22101792693138123, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0045, + "step": 18970 + }, + { + "epoch": 1.1616377991309137, + "grad_norm": 0.1097906231880188, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0037, + "step": 18980 + }, + { + "epoch": 1.162249831691046, + "grad_norm": 0.16169370710849762, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.162861864251178, + "grad_norm": 0.32931753993034363, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0052, + "step": 19000 + }, + { + "epoch": 1.1634738968113103, + "grad_norm": 0.2494741678237915, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0057, + "step": 19010 + }, + { + "epoch": 1.1640859293714425, + "grad_norm": 0.18492171168327332, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0056, + "step": 19020 + }, + { + "epoch": 1.1646979619315747, + "grad_norm": 0.18830963969230652, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0036, + "step": 19030 + }, + { + "epoch": 1.165309994491707, + "grad_norm": 0.1331586092710495, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0046, + "step": 19040 + }, + { + "epoch": 1.1659220270518391, + "grad_norm": 0.2433806210756302, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0053, + "step": 19050 + }, + { + "epoch": 1.1665340596119713, + "grad_norm": 0.24491485953330994, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 1.1671460921721035, + "grad_norm": 0.1789211630821228, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 1.1677581247322357, + "grad_norm": 0.2729121148586273, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0043, + "step": 19080 + }, + { + "epoch": 1.168370157292368, + "grad_norm": 0.19535189867019653, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0056, + "step": 19090 + }, + { + "epoch": 1.1689821898525001, + "grad_norm": 0.2282983660697937, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0048, + "step": 19100 + }, + { + "epoch": 1.1695942224126323, + "grad_norm": 0.1281195729970932, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0045, + "step": 19110 + }, + { + "epoch": 1.1702062549727645, + "grad_norm": 0.2850968539714813, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0034, + "step": 19120 + }, + { + "epoch": 1.1708182875328967, + "grad_norm": 0.12891536951065063, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.171430320093029, + "grad_norm": 0.13464727997779846, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0033, + "step": 19140 + }, + { + "epoch": 1.1720423526531611, + "grad_norm": 0.2415568083524704, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0041, + "step": 19150 + }, + { + "epoch": 1.1726543852132933, + "grad_norm": 0.15686331689357758, + "learning_rate": 7.525246655150879e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 1.1732664177734256, + "grad_norm": 0.15490666031837463, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0039, + "step": 19170 + }, + { + "epoch": 1.1738784503335578, + "grad_norm": 0.14095450937747955, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0034, + "step": 19180 + }, + { + "epoch": 1.17449048289369, + "grad_norm": 0.19024531543254852, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0084, + "step": 19190 + }, + { + "epoch": 1.1751025154538222, + "grad_norm": 0.2583692669868469, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 1.1757145480139544, + "grad_norm": 0.19117654860019684, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0038, + "step": 19210 + }, + { + "epoch": 1.1763265805740866, + "grad_norm": 0.15838374197483063, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1769386131342188, + "grad_norm": 0.30352044105529785, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0052, + "step": 19230 + }, + { + "epoch": 1.177550645694351, + "grad_norm": 0.229969322681427, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0043, + "step": 19240 + }, + { + "epoch": 1.1781626782544832, + "grad_norm": 0.17781461775302887, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0046, + "step": 19250 + }, + { + "epoch": 1.1787747108146154, + "grad_norm": 0.1306339055299759, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1793867433747476, + "grad_norm": 0.15727253258228302, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0045, + "step": 19270 + }, + { + "epoch": 1.1799987759348798, + "grad_norm": 0.24909166991710663, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0045, + "step": 19280 + }, + { + "epoch": 1.180610808495012, + "grad_norm": 0.4604126811027527, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0053, + "step": 19290 + }, + { + "epoch": 1.1812228410551442, + "grad_norm": 0.12739762663841248, + "learning_rate": 7.399737764864619e-06, + "loss": 0.004, + "step": 19300 + }, + { + "epoch": 1.1818348736152764, + "grad_norm": 0.2849223017692566, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0043, + "step": 19310 + }, + { + "epoch": 1.1824469061754086, + "grad_norm": 0.26089897751808167, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0044, + "step": 19320 + }, + { + "epoch": 1.1830589387355408, + "grad_norm": 0.1752242147922516, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.183670971295673, + "grad_norm": 0.14917130768299103, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0097, + "step": 19340 + }, + { + "epoch": 1.1842830038558052, + "grad_norm": 0.1599114090204239, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0061, + "step": 19350 + }, + { + "epoch": 1.1848950364159374, + "grad_norm": 0.16370004415512085, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 1.1855070689760696, + "grad_norm": 0.19354844093322754, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0032, + "step": 19370 + }, + { + "epoch": 1.1861191015362018, + "grad_norm": 0.19689561426639557, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0067, + "step": 19380 + }, + { + "epoch": 1.186731134096334, + "grad_norm": 0.22203278541564941, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0041, + "step": 19390 + }, + { + "epoch": 1.1873431666564662, + "grad_norm": 0.13579773902893066, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0048, + "step": 19400 + }, + { + "epoch": 1.1879551992165984, + "grad_norm": 0.12321218848228455, + "learning_rate": 7.301703138094429e-06, + "loss": 0.004, + "step": 19410 + }, + { + "epoch": 1.1885672317767306, + "grad_norm": 0.28819525241851807, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 1.1891792643368628, + "grad_norm": 0.2577916085720062, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0039, + "step": 19430 + }, + { + "epoch": 1.189791296896995, + "grad_norm": 0.26840633153915405, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0062, + "step": 19440 + }, + { + "epoch": 1.1904033294571272, + "grad_norm": 0.24222144484519958, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1910153620172594, + "grad_norm": 0.157009556889534, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0038, + "step": 19460 + }, + { + "epoch": 1.1916273945773916, + "grad_norm": 0.19925500452518463, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0042, + "step": 19470 + }, + { + "epoch": 1.1922394271375236, + "grad_norm": 0.19200846552848816, + "learning_rate": 7.239590017751423e-06, + "loss": 0.004, + "step": 19480 + }, + { + "epoch": 1.1928514596976558, + "grad_norm": 0.18441490828990936, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 1.193463492257788, + "grad_norm": 0.27565324306488037, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0046, + "step": 19500 + }, + { + "epoch": 1.1940755248179202, + "grad_norm": 0.17830556631088257, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0043, + "step": 19510 + }, + { + "epoch": 1.1946875573780524, + "grad_norm": 0.2769330143928528, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 1.1952995899381846, + "grad_norm": 0.168451189994812, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0039, + "step": 19530 + }, + { + "epoch": 1.1959116224983168, + "grad_norm": 0.31246763467788696, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0046, + "step": 19540 + }, + { + "epoch": 1.196523655058449, + "grad_norm": 0.21112671494483948, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0041, + "step": 19550 + }, + { + "epoch": 1.1971356876185812, + "grad_norm": 0.31681302189826965, + "learning_rate": 7.168868583990693e-06, + "loss": 0.005, + "step": 19560 + }, + { + "epoch": 1.1977477201787134, + "grad_norm": 0.18634411692619324, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0042, + "step": 19570 + }, + { + "epoch": 1.1983597527388457, + "grad_norm": 0.17780153453350067, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0057, + "step": 19580 + }, + { + "epoch": 1.1989717852989779, + "grad_norm": 0.19183002412319183, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0043, + "step": 19590 + }, + { + "epoch": 1.19958381785911, + "grad_norm": 0.28469574451446533, + "learning_rate": 7.133615440411572e-06, + "loss": 0.004, + "step": 19600 + }, + { + "epoch": 1.2001958504192423, + "grad_norm": 0.22470368444919586, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0044, + "step": 19610 + }, + { + "epoch": 1.2008078829793745, + "grad_norm": 0.23563240468502045, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0041, + "step": 19620 + }, + { + "epoch": 1.2014199155395067, + "grad_norm": 0.18467430770397186, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0053, + "step": 19630 + }, + { + "epoch": 1.2020319480996389, + "grad_norm": 0.12539178133010864, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0047, + "step": 19640 + }, + { + "epoch": 1.202643980659771, + "grad_norm": 0.2552005648612976, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.004, + "step": 19650 + }, + { + "epoch": 1.2032560132199033, + "grad_norm": 0.13963459432125092, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0035, + "step": 19660 + }, + { + "epoch": 1.2038680457800355, + "grad_norm": 0.17387327551841736, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0038, + "step": 19670 + }, + { + "epoch": 1.2044800783401677, + "grad_norm": 0.1284111589193344, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0044, + "step": 19680 + }, + { + "epoch": 1.2050921109002999, + "grad_norm": 0.22337380051612854, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 1.205704143460432, + "grad_norm": 0.2254808247089386, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0033, + "step": 19700 + }, + { + "epoch": 1.2063161760205643, + "grad_norm": 0.19316980242729187, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0041, + "step": 19710 + }, + { + "epoch": 1.2069282085806965, + "grad_norm": 0.17951075732707977, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0038, + "step": 19720 + }, + { + "epoch": 1.2075402411408287, + "grad_norm": 0.3105165660381317, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0043, + "step": 19730 + }, + { + "epoch": 1.208152273700961, + "grad_norm": 0.21083533763885498, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0039, + "step": 19740 + }, + { + "epoch": 1.208764306261093, + "grad_norm": 0.20121195912361145, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0035, + "step": 19750 + }, + { + "epoch": 1.2093763388212253, + "grad_norm": 0.20067447423934937, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0047, + "step": 19760 + }, + { + "epoch": 1.2099883713813575, + "grad_norm": 0.15943066775798798, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0039, + "step": 19770 + }, + { + "epoch": 1.2106004039414897, + "grad_norm": 0.21581032872200012, + "learning_rate": 6.975884226362e-06, + "loss": 0.0045, + "step": 19780 + }, + { + "epoch": 1.211212436501622, + "grad_norm": 0.16258753836154938, + "learning_rate": 6.967165692827958e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 1.2118244690617541, + "grad_norm": 0.18742400407791138, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0047, + "step": 19800 + }, + { + "epoch": 1.2124365016218863, + "grad_norm": 0.09035168588161469, + "learning_rate": 6.949742834253074e-06, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 1.2130485341820185, + "grad_norm": 0.21749694645404816, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0054, + "step": 19820 + }, + { + "epoch": 1.2136605667421507, + "grad_norm": 0.3189448416233063, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0043, + "step": 19830 + }, + { + "epoch": 1.214272599302283, + "grad_norm": 0.26815512776374817, + "learning_rate": 6.923644220932124e-06, + "loss": 0.005, + "step": 19840 + }, + { + "epoch": 1.2148846318624151, + "grad_norm": 0.19533704221248627, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0037, + "step": 19850 + }, + { + "epoch": 1.2154966644225473, + "grad_norm": 0.36249589920043945, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0064, + "step": 19860 + }, + { + "epoch": 1.2161086969826795, + "grad_norm": 0.19801265001296997, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0042, + "step": 19870 + }, + { + "epoch": 1.2167207295428117, + "grad_norm": 0.10341386497020721, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0053, + "step": 19880 + }, + { + "epoch": 1.217332762102944, + "grad_norm": 0.17985381186008453, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.2179447946630761, + "grad_norm": 0.18160982429981232, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 1.2185568272232083, + "grad_norm": 0.15552182495594025, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 1.2191688597833406, + "grad_norm": 0.34908807277679443, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0046, + "step": 19920 + }, + { + "epoch": 1.2197808923434728, + "grad_norm": 0.14835652709007263, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0042, + "step": 19930 + }, + { + "epoch": 1.220392924903605, + "grad_norm": 0.23276430368423462, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 1.2210049574637372, + "grad_norm": 0.1900823563337326, + "learning_rate": 6.828319751504063e-06, + "loss": 0.004, + "step": 19950 + }, + { + "epoch": 1.2216169900238694, + "grad_norm": 0.134046271443367, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0039, + "step": 19960 + }, + { + "epoch": 1.2222290225840013, + "grad_norm": 0.17264600098133087, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 1.2228410551441335, + "grad_norm": 0.24845834076404572, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0044, + "step": 19980 + }, + { + "epoch": 1.2234530877042658, + "grad_norm": 0.14805762469768524, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0049, + "step": 19990 + }, + { + "epoch": 1.224065120264398, + "grad_norm": 0.228907972574234, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0043, + "step": 20000 + }, + { + "epoch": 1.2246771528245302, + "grad_norm": 0.16869507730007172, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0041, + "step": 20010 + }, + { + "epoch": 1.2252891853846624, + "grad_norm": 0.1983603835105896, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0041, + "step": 20020 + }, + { + "epoch": 1.2259012179447946, + "grad_norm": 0.17656362056732178, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0028, + "step": 20030 + }, + { + "epoch": 1.2265132505049268, + "grad_norm": 0.1360313892364502, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0069, + "step": 20040 + }, + { + "epoch": 1.227125283065059, + "grad_norm": 0.21057721972465515, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2277373156251912, + "grad_norm": 0.138632670044899, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0038, + "step": 20060 + }, + { + "epoch": 1.2283493481853234, + "grad_norm": 0.17815573513507843, + "learning_rate": 6.725005485342219e-06, + "loss": 0.003, + "step": 20070 + }, + { + "epoch": 1.2289613807454556, + "grad_norm": 0.1769353598356247, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0066, + "step": 20080 + }, + { + "epoch": 1.2295734133055878, + "grad_norm": 0.23068928718566895, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0048, + "step": 20090 + }, + { + "epoch": 1.23018544586572, + "grad_norm": 0.25139328837394714, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0049, + "step": 20100 + }, + { + "epoch": 1.2307974784258522, + "grad_norm": 0.09128634631633759, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0042, + "step": 20110 + }, + { + "epoch": 1.2314095109859844, + "grad_norm": 0.20516613125801086, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0031, + "step": 20120 + }, + { + "epoch": 1.2320215435461166, + "grad_norm": 0.1518358588218689, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0049, + "step": 20130 + }, + { + "epoch": 1.2326335761062488, + "grad_norm": 0.1673758625984192, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0044, + "step": 20140 + }, + { + "epoch": 1.233245608666381, + "grad_norm": 0.14084585011005402, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0053, + "step": 20150 + }, + { + "epoch": 1.2338576412265132, + "grad_norm": 0.23316942155361176, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0042, + "step": 20160 + }, + { + "epoch": 1.2344696737866454, + "grad_norm": 0.23793813586235046, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0045, + "step": 20170 + }, + { + "epoch": 1.2350817063467776, + "grad_norm": 0.4269389510154724, + "learning_rate": 6.630934952049143e-06, + "loss": 0.005, + "step": 20180 + }, + { + "epoch": 1.2356937389069098, + "grad_norm": 0.15654191374778748, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0039, + "step": 20190 + }, + { + "epoch": 1.236305771467042, + "grad_norm": 0.19204623997211456, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0032, + "step": 20200 + }, + { + "epoch": 1.2369178040271742, + "grad_norm": 0.15817691385746002, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0044, + "step": 20210 + }, + { + "epoch": 1.2375298365873064, + "grad_norm": 0.12637947499752045, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2381418691474386, + "grad_norm": 0.26657921075820923, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0041, + "step": 20230 + }, + { + "epoch": 1.2387539017075708, + "grad_norm": 0.15207791328430176, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0045, + "step": 20240 + }, + { + "epoch": 1.239365934267703, + "grad_norm": 0.32583367824554443, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 1.2399779668278352, + "grad_norm": 0.15617726743221283, + "learning_rate": 6.562908932779455e-06, + "loss": 0.004, + "step": 20260 + }, + { + "epoch": 1.2405899993879674, + "grad_norm": 0.1935809850692749, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2412020319480996, + "grad_norm": 0.17422369122505188, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0035, + "step": 20280 + }, + { + "epoch": 1.2418140645082318, + "grad_norm": 0.15332955121994019, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0049, + "step": 20290 + }, + { + "epoch": 1.242426097068364, + "grad_norm": 0.16183018684387207, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0042, + "step": 20300 + }, + { + "epoch": 1.2430381296284962, + "grad_norm": 0.28421106934547424, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0045, + "step": 20310 + }, + { + "epoch": 1.2436501621886284, + "grad_norm": 0.23288874328136444, + "learning_rate": 6.512107839793337e-06, + "loss": 0.004, + "step": 20320 + }, + { + "epoch": 1.2442621947487607, + "grad_norm": 0.17955242097377777, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 1.2448742273088929, + "grad_norm": 0.20192117989063263, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 1.245486259869025, + "grad_norm": 0.15365810692310333, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 1.2460982924291573, + "grad_norm": 0.25220832228660583, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0039, + "step": 20360 + }, + { + "epoch": 1.2467103249892895, + "grad_norm": 0.25777462124824524, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0053, + "step": 20370 + }, + { + "epoch": 1.2473223575494217, + "grad_norm": 0.2693277895450592, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0053, + "step": 20380 + }, + { + "epoch": 1.2479343901095539, + "grad_norm": 0.22846420109272003, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0033, + "step": 20390 + }, + { + "epoch": 1.248546422669686, + "grad_norm": 0.17022505402565002, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0046, + "step": 20400 + }, + { + "epoch": 1.2491584552298183, + "grad_norm": 0.08295682072639465, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0035, + "step": 20410 + }, + { + "epoch": 1.2497704877899505, + "grad_norm": 0.2745625972747803, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0044, + "step": 20420 + }, + { + "epoch": 1.2503825203500827, + "grad_norm": 0.12855033576488495, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2509945529102149, + "grad_norm": 0.30358386039733887, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0049, + "step": 20440 + }, + { + "epoch": 1.251606585470347, + "grad_norm": 0.15514959394931793, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0048, + "step": 20450 + }, + { + "epoch": 1.2522186180304793, + "grad_norm": 0.1414988487958908, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0088, + "step": 20460 + }, + { + "epoch": 1.2528306505906115, + "grad_norm": 0.17399665713310242, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0046, + "step": 20470 + }, + { + "epoch": 1.2534426831507437, + "grad_norm": 0.22629426419734955, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.254054715710876, + "grad_norm": 0.30595293641090393, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0027, + "step": 20490 + }, + { + "epoch": 1.254666748271008, + "grad_norm": 0.17980262637138367, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2552787808311403, + "grad_norm": 0.19016452133655548, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0029, + "step": 20510 + }, + { + "epoch": 1.2558908133912725, + "grad_norm": 0.20200394093990326, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0037, + "step": 20520 + }, + { + "epoch": 1.2565028459514047, + "grad_norm": 0.15347513556480408, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0041, + "step": 20530 + }, + { + "epoch": 1.257114878511537, + "grad_norm": 0.1851687729358673, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0042, + "step": 20540 + }, + { + "epoch": 1.2577269110716691, + "grad_norm": 0.2529662549495697, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0037, + "step": 20550 + }, + { + "epoch": 1.2583389436318013, + "grad_norm": 0.18209592998027802, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 1.2589509761919335, + "grad_norm": 0.18981963396072388, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0036, + "step": 20570 + }, + { + "epoch": 1.2595630087520657, + "grad_norm": 0.13232728838920593, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0035, + "step": 20580 + }, + { + "epoch": 1.260175041312198, + "grad_norm": 0.133514404296875, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0039, + "step": 20590 + }, + { + "epoch": 1.2607870738723301, + "grad_norm": 0.14339123666286469, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0043, + "step": 20600 + }, + { + "epoch": 1.2613991064324623, + "grad_norm": 0.48857489228248596, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0045, + "step": 20610 + }, + { + "epoch": 1.2620111389925945, + "grad_norm": 0.1513262242078781, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0029, + "step": 20620 + }, + { + "epoch": 1.2626231715527267, + "grad_norm": 0.1497354805469513, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0039, + "step": 20630 + }, + { + "epoch": 1.2632352041128587, + "grad_norm": 0.132791206240654, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0037, + "step": 20640 + }, + { + "epoch": 1.263847236672991, + "grad_norm": 0.13804496824741364, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0035, + "step": 20650 + }, + { + "epoch": 1.2644592692331231, + "grad_norm": 0.19393391907215118, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0049, + "step": 20660 + }, + { + "epoch": 1.2650713017932553, + "grad_norm": 0.17623338103294373, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0038, + "step": 20670 + }, + { + "epoch": 1.2656833343533875, + "grad_norm": 0.26931124925613403, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0042, + "step": 20680 + }, + { + "epoch": 1.2662953669135197, + "grad_norm": 0.17984439432621002, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0036, + "step": 20690 + }, + { + "epoch": 1.266907399473652, + "grad_norm": 0.19648219645023346, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0046, + "step": 20700 + }, + { + "epoch": 1.2675194320337841, + "grad_norm": 0.1464766263961792, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0024, + "step": 20710 + }, + { + "epoch": 1.2681314645939163, + "grad_norm": 0.1271074265241623, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0042, + "step": 20720 + }, + { + "epoch": 1.2687434971540485, + "grad_norm": 0.15960967540740967, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0079, + "step": 20730 + }, + { + "epoch": 1.2693555297141808, + "grad_norm": 0.13636153936386108, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0046, + "step": 20740 + }, + { + "epoch": 1.269967562274313, + "grad_norm": 0.19099050760269165, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0046, + "step": 20750 + }, + { + "epoch": 1.2705795948344452, + "grad_norm": 0.28632739186286926, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0036, + "step": 20760 + }, + { + "epoch": 1.2711916273945774, + "grad_norm": 0.2565019726753235, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0055, + "step": 20770 + }, + { + "epoch": 1.2718036599547096, + "grad_norm": 0.24443399906158447, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2724156925148418, + "grad_norm": 0.1396762877702713, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0029, + "step": 20790 + }, + { + "epoch": 1.273027725074974, + "grad_norm": 0.3028377890586853, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0061, + "step": 20800 + }, + { + "epoch": 1.2736397576351062, + "grad_norm": 0.18195804953575134, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0034, + "step": 20810 + }, + { + "epoch": 1.2742517901952384, + "grad_norm": 0.16194652020931244, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0054, + "step": 20820 + }, + { + "epoch": 1.2748638227553706, + "grad_norm": 0.13011956214904785, + "learning_rate": 6.08816828695283e-06, + "loss": 0.003, + "step": 20830 + }, + { + "epoch": 1.2754758553155028, + "grad_norm": 0.23294220864772797, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0041, + "step": 20840 + }, + { + "epoch": 1.276087887875635, + "grad_norm": 0.1892961710691452, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0031, + "step": 20850 + }, + { + "epoch": 1.2766999204357672, + "grad_norm": 0.1984476000070572, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0046, + "step": 20860 + }, + { + "epoch": 1.2773119529958994, + "grad_norm": 0.158709317445755, + "learning_rate": 6.055535530104466e-06, + "loss": 0.003, + "step": 20870 + }, + { + "epoch": 1.2779239855560316, + "grad_norm": 0.16505110263824463, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0039, + "step": 20880 + }, + { + "epoch": 1.2785360181161638, + "grad_norm": 0.18332232534885406, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0036, + "step": 20890 + }, + { + "epoch": 1.279148050676296, + "grad_norm": 0.1797804981470108, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0049, + "step": 20900 + }, + { + "epoch": 1.2797600832364282, + "grad_norm": 0.19247964024543762, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0053, + "step": 20910 + }, + { + "epoch": 1.2803721157965604, + "grad_norm": 0.17845408618450165, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0045, + "step": 20920 + }, + { + "epoch": 1.2809841483566926, + "grad_norm": 0.09454555809497833, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0027, + "step": 20930 + }, + { + "epoch": 1.2815961809168248, + "grad_norm": 0.12647129595279694, + "learning_rate": 5.998651973182953e-06, + "loss": 0.004, + "step": 20940 + }, + { + "epoch": 1.282208213476957, + "grad_norm": 0.39115941524505615, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0051, + "step": 20950 + }, + { + "epoch": 1.2828202460370892, + "grad_norm": 0.29081296920776367, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 1.2834322785972214, + "grad_norm": 0.1849275827407837, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0042, + "step": 20970 + }, + { + "epoch": 1.2840443111573536, + "grad_norm": 0.24075689911842346, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0031, + "step": 20980 + }, + { + "epoch": 1.2846563437174858, + "grad_norm": 0.12463482469320297, + "learning_rate": 5.958196751005967e-06, + "loss": 0.003, + "step": 20990 + }, + { + "epoch": 1.285268376277618, + "grad_norm": 0.16987742483615875, + "learning_rate": 5.950123419134817e-06, + "loss": 0.004, + "step": 21000 + }, + { + "epoch": 1.2858804088377502, + "grad_norm": 0.20316782593727112, + "learning_rate": 5.942056013575106e-06, + "loss": 0.004, + "step": 21010 + }, + { + "epoch": 1.2864924413978824, + "grad_norm": 0.20989514887332916, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0053, + "step": 21020 + }, + { + "epoch": 1.2871044739580146, + "grad_norm": 0.33795273303985596, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0048, + "step": 21030 + }, + { + "epoch": 1.2877165065181468, + "grad_norm": 0.13918501138687134, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.004, + "step": 21040 + }, + { + "epoch": 1.288328539078279, + "grad_norm": 0.2992899715900421, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0038, + "step": 21050 + }, + { + "epoch": 1.288940571638411, + "grad_norm": 0.2540164589881897, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 1.2895526041985432, + "grad_norm": 0.161032035946846, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0047, + "step": 21070 + }, + { + "epoch": 1.2901646367586754, + "grad_norm": 0.1743200421333313, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0037, + "step": 21080 + }, + { + "epoch": 1.2907766693188076, + "grad_norm": 0.26604363322257996, + "learning_rate": 5.877731250949785e-06, + "loss": 0.004, + "step": 21090 + }, + { + "epoch": 1.2913887018789398, + "grad_norm": 0.275696724653244, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0044, + "step": 21100 + }, + { + "epoch": 1.292000734439072, + "grad_norm": 0.16888457536697388, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0042, + "step": 21110 + }, + { + "epoch": 1.2926127669992042, + "grad_norm": 0.12902231514453888, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0048, + "step": 21120 + }, + { + "epoch": 1.2932247995593364, + "grad_norm": 0.14577728509902954, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0046, + "step": 21130 + }, + { + "epoch": 1.2938368321194686, + "grad_norm": 0.1544434279203415, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0031, + "step": 21140 + }, + { + "epoch": 1.2944488646796009, + "grad_norm": 0.09238115698099136, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0035, + "step": 21150 + }, + { + "epoch": 1.295060897239733, + "grad_norm": 0.1770051270723343, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0033, + "step": 21160 + }, + { + "epoch": 1.2956729297998653, + "grad_norm": 0.20360831916332245, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0037, + "step": 21170 + }, + { + "epoch": 1.2962849623599975, + "grad_norm": 0.18503794074058533, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0045, + "step": 21180 + }, + { + "epoch": 1.2968969949201297, + "grad_norm": 0.12918968498706818, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0048, + "step": 21190 + }, + { + "epoch": 1.2975090274802619, + "grad_norm": 0.14289438724517822, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0041, + "step": 21200 + }, + { + "epoch": 1.298121060040394, + "grad_norm": 0.17546117305755615, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0042, + "step": 21210 + }, + { + "epoch": 1.2987330926005263, + "grad_norm": 0.2919277846813202, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0051, + "step": 21220 + }, + { + "epoch": 1.2993451251606585, + "grad_norm": 0.0988069474697113, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0044, + "step": 21230 + }, + { + "epoch": 1.2999571577207907, + "grad_norm": 0.19284513592720032, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0037, + "step": 21240 + }, + { + "epoch": 1.3005691902809229, + "grad_norm": 0.12894058227539062, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0031, + "step": 21250 + }, + { + "epoch": 1.301181222841055, + "grad_norm": 0.14740346372127533, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.3017932554011873, + "grad_norm": 0.16817794740200043, + "learning_rate": 5.734414476316747e-06, + "loss": 0.005, + "step": 21270 + }, + { + "epoch": 1.3024052879613195, + "grad_norm": 0.29237234592437744, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0039, + "step": 21280 + }, + { + "epoch": 1.3030173205214517, + "grad_norm": 0.12649856507778168, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.303629353081584, + "grad_norm": 0.11057443916797638, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0039, + "step": 21300 + }, + { + "epoch": 1.304241385641716, + "grad_norm": 0.13494674861431122, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.3048534182018483, + "grad_norm": 0.3079472482204437, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0042, + "step": 21320 + }, + { + "epoch": 1.3054654507619805, + "grad_norm": 0.13513535261154175, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.3060774833221127, + "grad_norm": 0.39266663789749146, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0046, + "step": 21340 + }, + { + "epoch": 1.306689515882245, + "grad_norm": 0.15097978711128235, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.3073015484423771, + "grad_norm": 0.25206202268600464, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0049, + "step": 21360 + }, + { + "epoch": 1.3079135810025093, + "grad_norm": 0.16765817999839783, + "learning_rate": 5.655655685355026e-06, + "loss": 0.005, + "step": 21370 + }, + { + "epoch": 1.3085256135626415, + "grad_norm": 0.2137158215045929, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0048, + "step": 21380 + }, + { + "epoch": 1.3091376461227737, + "grad_norm": 0.19711454212665558, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0043, + "step": 21390 + }, + { + "epoch": 1.309749678682906, + "grad_norm": 0.1722051054239273, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0044, + "step": 21400 + }, + { + "epoch": 1.3103617112430381, + "grad_norm": 0.1807536482810974, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 1.3109737438031703, + "grad_norm": 0.15052185952663422, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.004, + "step": 21420 + }, + { + "epoch": 1.3115857763633025, + "grad_norm": 0.1485220491886139, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.3121978089234347, + "grad_norm": 0.15065325796604156, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0037, + "step": 21440 + }, + { + "epoch": 1.312809841483567, + "grad_norm": 0.17903591692447662, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0047, + "step": 21450 + }, + { + "epoch": 1.3134218740436991, + "grad_norm": 0.14310622215270996, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0043, + "step": 21460 + }, + { + "epoch": 1.3140339066038313, + "grad_norm": 0.12117830663919449, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0053, + "step": 21470 + }, + { + "epoch": 1.3146459391639636, + "grad_norm": 0.1484573632478714, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 1.3152579717240958, + "grad_norm": 0.16559219360351562, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0037, + "step": 21490 + }, + { + "epoch": 1.315870004284228, + "grad_norm": 0.21626432240009308, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0031, + "step": 21500 + }, + { + "epoch": 1.3164820368443602, + "grad_norm": 0.08177383989095688, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0052, + "step": 21510 + }, + { + "epoch": 1.3170940694044924, + "grad_norm": 0.18640732765197754, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0062, + "step": 21520 + }, + { + "epoch": 1.3177061019646246, + "grad_norm": 0.2599853277206421, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0039, + "step": 21530 + }, + { + "epoch": 1.3183181345247568, + "grad_norm": 0.1591203212738037, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0034, + "step": 21540 + }, + { + "epoch": 1.318930167084889, + "grad_norm": 0.2834412455558777, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0037, + "step": 21550 + }, + { + "epoch": 1.3195421996450212, + "grad_norm": 0.13853803277015686, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0038, + "step": 21560 + }, + { + "epoch": 1.3201542322051534, + "grad_norm": 0.14707128703594208, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0042, + "step": 21570 + }, + { + "epoch": 1.3207662647652856, + "grad_norm": 0.12561920285224915, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0038, + "step": 21580 + }, + { + "epoch": 1.3213782973254178, + "grad_norm": 0.4156799018383026, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0051, + "step": 21590 + }, + { + "epoch": 1.32199032988555, + "grad_norm": 0.11400662362575531, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0031, + "step": 21600 + }, + { + "epoch": 1.3226023624456822, + "grad_norm": 0.15658807754516602, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0052, + "step": 21610 + }, + { + "epoch": 1.3232143950058144, + "grad_norm": 0.1212862953543663, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0034, + "step": 21620 + }, + { + "epoch": 1.3238264275659466, + "grad_norm": 0.2201654314994812, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0036, + "step": 21630 + }, + { + "epoch": 1.3244384601260788, + "grad_norm": 0.11623375117778778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0032, + "step": 21640 + }, + { + "epoch": 1.325050492686211, + "grad_norm": 0.13092897832393646, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0035, + "step": 21650 + }, + { + "epoch": 1.3256625252463432, + "grad_norm": 0.15409153699874878, + "learning_rate": 5.430834687545416e-06, + "loss": 0.004, + "step": 21660 + }, + { + "epoch": 1.3262745578064754, + "grad_norm": 0.3148297369480133, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0031, + "step": 21670 + }, + { + "epoch": 1.3268865903666076, + "grad_norm": 0.13435055315494537, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0033, + "step": 21680 + }, + { + "epoch": 1.3274986229267398, + "grad_norm": 0.17878089845180511, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0029, + "step": 21690 + }, + { + "epoch": 1.328110655486872, + "grad_norm": 0.1823783665895462, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0039, + "step": 21700 + }, + { + "epoch": 1.3287226880470042, + "grad_norm": 0.14492660760879517, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0033, + "step": 21710 + }, + { + "epoch": 1.3293347206071364, + "grad_norm": 0.1730341762304306, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0041, + "step": 21720 + }, + { + "epoch": 1.3299467531672686, + "grad_norm": 0.07961586117744446, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3305587857274008, + "grad_norm": 0.14440582692623138, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0038, + "step": 21740 + }, + { + "epoch": 1.331170818287533, + "grad_norm": 0.22034496068954468, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0023, + "step": 21750 + }, + { + "epoch": 1.3317828508476652, + "grad_norm": 0.1861305832862854, + "learning_rate": 5.354573491223212e-06, + "loss": 0.005, + "step": 21760 + }, + { + "epoch": 1.3323948834077972, + "grad_norm": 0.15587164461612701, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0044, + "step": 21770 + }, + { + "epoch": 1.3330069159679294, + "grad_norm": 0.6852900981903076, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0075, + "step": 21780 + }, + { + "epoch": 1.3336189485280616, + "grad_norm": 0.14315280318260193, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0034, + "step": 21790 + }, + { + "epoch": 1.3342309810881938, + "grad_norm": 0.350981205701828, + "learning_rate": 5.324254018551227e-06, + "loss": 0.004, + "step": 21800 + }, + { + "epoch": 1.334843013648326, + "grad_norm": 0.12344911694526672, + "learning_rate": 5.316690780174352e-06, + "loss": 0.004, + "step": 21810 + }, + { + "epoch": 1.3354550462084582, + "grad_norm": 0.18744061887264252, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3360670787685904, + "grad_norm": 0.22747837007045746, + "learning_rate": 5.301584321328435e-06, + "loss": 0.004, + "step": 21830 + }, + { + "epoch": 1.3366791113287226, + "grad_norm": 0.22695699334144592, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0042, + "step": 21840 + }, + { + "epoch": 1.3372911438888548, + "grad_norm": 0.17258964478969574, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0044, + "step": 21850 + }, + { + "epoch": 1.337903176448987, + "grad_norm": 0.1523793637752533, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0047, + "step": 21860 + }, + { + "epoch": 1.3385152090091192, + "grad_norm": 0.1983587145805359, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0037, + "step": 21870 + }, + { + "epoch": 1.3391272415692514, + "grad_norm": 0.1263747215270996, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0034, + "step": 21880 + }, + { + "epoch": 1.3397392741293837, + "grad_norm": 0.1550009399652481, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3403513066895159, + "grad_norm": 0.14963915944099426, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.340963339249648, + "grad_norm": 0.17783671617507935, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0036, + "step": 21910 + }, + { + "epoch": 1.3415753718097803, + "grad_norm": 0.2715896964073181, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3421874043699125, + "grad_norm": 0.22924886643886566, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0037, + "step": 21930 + }, + { + "epoch": 1.3427994369300447, + "grad_norm": 0.13689789175987244, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0033, + "step": 21940 + }, + { + "epoch": 1.3434114694901769, + "grad_norm": 0.09137748926877975, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0031, + "step": 21950 + }, + { + "epoch": 1.344023502050309, + "grad_norm": 0.17097881436347961, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0031, + "step": 21960 + }, + { + "epoch": 1.3446355346104413, + "grad_norm": 0.23919200897216797, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0046, + "step": 21970 + }, + { + "epoch": 1.3452475671705735, + "grad_norm": 0.14261527359485626, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0037, + "step": 21980 + }, + { + "epoch": 1.3458595997307057, + "grad_norm": 0.156734898686409, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.3464716322908379, + "grad_norm": 0.21755588054656982, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0032, + "step": 22000 + }, + { + "epoch": 1.34708366485097, + "grad_norm": 0.1373317390680313, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0033, + "step": 22010 + }, + { + "epoch": 1.3476956974111023, + "grad_norm": 0.1646856814622879, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0047, + "step": 22020 + }, + { + "epoch": 1.3483077299712345, + "grad_norm": 0.1908850073814392, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0044, + "step": 22030 + }, + { + "epoch": 1.3489197625313667, + "grad_norm": 0.24862833321094513, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0041, + "step": 22040 + }, + { + "epoch": 1.349531795091499, + "grad_norm": 0.15980397164821625, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0033, + "step": 22050 + }, + { + "epoch": 1.350143827651631, + "grad_norm": 0.1157977357506752, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0036, + "step": 22060 + }, + { + "epoch": 1.3507558602117633, + "grad_norm": 0.11186888068914413, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0046, + "step": 22070 + }, + { + "epoch": 1.3513678927718955, + "grad_norm": 0.17715996503829956, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0035, + "step": 22080 + }, + { + "epoch": 1.3519799253320277, + "grad_norm": 0.1265174001455307, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0048, + "step": 22090 + }, + { + "epoch": 1.35259195789216, + "grad_norm": 0.13969522714614868, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0028, + "step": 22100 + }, + { + "epoch": 1.3532039904522921, + "grad_norm": 0.13246525824069977, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0026, + "step": 22110 + }, + { + "epoch": 1.3538160230124243, + "grad_norm": 0.14675064384937286, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0082, + "step": 22120 + }, + { + "epoch": 1.3544280555725565, + "grad_norm": 0.15810683369636536, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0031, + "step": 22130 + }, + { + "epoch": 1.3550400881326887, + "grad_norm": 0.20675864815711975, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0035, + "step": 22140 + }, + { + "epoch": 1.355652120692821, + "grad_norm": 0.1921442300081253, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0038, + "step": 22150 + }, + { + "epoch": 1.3562641532529531, + "grad_norm": 0.14300711452960968, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0035, + "step": 22160 + }, + { + "epoch": 1.3568761858130853, + "grad_norm": 0.0656728520989418, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0047, + "step": 22170 + }, + { + "epoch": 1.3574882183732175, + "grad_norm": 0.148203507065773, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0041, + "step": 22180 + }, + { + "epoch": 1.3581002509333495, + "grad_norm": 0.15472126007080078, + "learning_rate": 5.034310349217475e-06, + "loss": 0.004, + "step": 22190 + }, + { + "epoch": 1.3587122834934817, + "grad_norm": 0.12006669491529465, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0034, + "step": 22200 + }, + { + "epoch": 1.359324316053614, + "grad_norm": 0.15345145761966705, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0032, + "step": 22210 + }, + { + "epoch": 1.3599363486137461, + "grad_norm": 0.17429186403751373, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0039, + "step": 22220 + }, + { + "epoch": 1.3605483811738783, + "grad_norm": 0.20691345632076263, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0029, + "step": 22230 + }, + { + "epoch": 1.3611604137340105, + "grad_norm": 0.1874946504831314, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0042, + "step": 22240 + }, + { + "epoch": 1.3617724462941427, + "grad_norm": 0.12159912288188934, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0033, + "step": 22250 + }, + { + "epoch": 1.362384478854275, + "grad_norm": 0.29434919357299805, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0044, + "step": 22260 + }, + { + "epoch": 1.3629965114144071, + "grad_norm": 0.06661798804998398, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0031, + "step": 22270 + }, + { + "epoch": 1.3636085439745393, + "grad_norm": 0.14819994568824768, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0039, + "step": 22280 + }, + { + "epoch": 1.3642205765346715, + "grad_norm": 0.17289887368679047, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0035, + "step": 22290 + }, + { + "epoch": 1.3648326090948038, + "grad_norm": 0.18789313733577728, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0036, + "step": 22300 + }, + { + "epoch": 1.365444641654936, + "grad_norm": 0.1877586394548416, + "learning_rate": 4.947215397583639e-06, + "loss": 0.004, + "step": 22310 + }, + { + "epoch": 1.3660566742150682, + "grad_norm": 0.11696574836969376, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0036, + "step": 22320 + }, + { + "epoch": 1.3666687067752004, + "grad_norm": 0.2511763274669647, + "learning_rate": 4.932798621873274e-06, + "loss": 0.004, + "step": 22330 + }, + { + "epoch": 1.3672807393353326, + "grad_norm": 0.15005314350128174, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0046, + "step": 22340 + }, + { + "epoch": 1.3678927718954648, + "grad_norm": 0.16856855154037476, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0056, + "step": 22350 + }, + { + "epoch": 1.368504804455597, + "grad_norm": 0.24532385170459747, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0035, + "step": 22360 + }, + { + "epoch": 1.3691168370157292, + "grad_norm": 0.29320162534713745, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0047, + "step": 22370 + }, + { + "epoch": 1.3697288695758614, + "grad_norm": 0.1518300473690033, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0041, + "step": 22380 + }, + { + "epoch": 1.3703409021359936, + "grad_norm": 0.13431201875209808, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0033, + "step": 22390 + }, + { + "epoch": 1.3709529346961258, + "grad_norm": 0.17390409111976624, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0039, + "step": 22400 + }, + { + "epoch": 1.371564967256258, + "grad_norm": 0.16482478380203247, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.007, + "step": 22410 + }, + { + "epoch": 1.3721769998163902, + "grad_norm": 0.11469490826129913, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0041, + "step": 22420 + }, + { + "epoch": 1.3727890323765224, + "grad_norm": 0.2327135056257248, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0043, + "step": 22430 + }, + { + "epoch": 1.3734010649366546, + "grad_norm": 0.1373092532157898, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0036, + "step": 22440 + }, + { + "epoch": 1.3740130974967868, + "grad_norm": 0.1534084528684616, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0028, + "step": 22450 + }, + { + "epoch": 1.374625130056919, + "grad_norm": 0.3217960596084595, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0044, + "step": 22460 + }, + { + "epoch": 1.3752371626170512, + "grad_norm": 0.14245563745498657, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0039, + "step": 22470 + }, + { + "epoch": 1.3758491951771834, + "grad_norm": 0.17652876675128937, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0031, + "step": 22480 + }, + { + "epoch": 1.3764612277373156, + "grad_norm": 0.1996244192123413, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0034, + "step": 22490 + }, + { + "epoch": 1.3770732602974478, + "grad_norm": 0.1658472716808319, + "learning_rate": 4.81141273556404e-06, + "loss": 0.003, + "step": 22500 + }, + { + "epoch": 1.37768529285758, + "grad_norm": 0.16233472526073456, + "learning_rate": 4.804337352679613e-06, + "loss": 0.004, + "step": 22510 + }, + { + "epoch": 1.3782973254177122, + "grad_norm": 0.13045033812522888, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0049, + "step": 22520 + }, + { + "epoch": 1.3789093579778444, + "grad_norm": 0.1195274218916893, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0042, + "step": 22530 + }, + { + "epoch": 1.3795213905379766, + "grad_norm": 0.14395804703235626, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0036, + "step": 22540 + }, + { + "epoch": 1.3801334230981088, + "grad_norm": 0.24495497345924377, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0047, + "step": 22550 + }, + { + "epoch": 1.380745455658241, + "grad_norm": 0.14288006722927094, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0044, + "step": 22560 + }, + { + "epoch": 1.3813574882183732, + "grad_norm": 0.16967979073524475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0051, + "step": 22570 + }, + { + "epoch": 1.3819695207785054, + "grad_norm": 0.2023036777973175, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0032, + "step": 22580 + }, + { + "epoch": 1.3825815533386376, + "grad_norm": 0.1191902756690979, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0026, + "step": 22590 + }, + { + "epoch": 1.3831935858987698, + "grad_norm": 0.16922403872013092, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0037, + "step": 22600 + }, + { + "epoch": 1.383805618458902, + "grad_norm": 0.12394976615905762, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0038, + "step": 22610 + }, + { + "epoch": 1.3844176510190342, + "grad_norm": 0.23889753222465515, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0041, + "step": 22620 + }, + { + "epoch": 1.3850296835791664, + "grad_norm": 0.31215062737464905, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0036, + "step": 22630 + }, + { + "epoch": 1.3856417161392987, + "grad_norm": 0.1519152820110321, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0042, + "step": 22640 + }, + { + "epoch": 1.3862537486994309, + "grad_norm": 0.3375433683395386, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0039, + "step": 22650 + }, + { + "epoch": 1.386865781259563, + "grad_norm": 0.21715323626995087, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0027, + "step": 22660 + }, + { + "epoch": 1.3874778138196953, + "grad_norm": 0.2066027969121933, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0033, + "step": 22670 + }, + { + "epoch": 1.3880898463798275, + "grad_norm": 0.11542408168315887, + "learning_rate": 4.6851750421442e-06, + "loss": 0.004, + "step": 22680 + }, + { + "epoch": 1.3887018789399597, + "grad_norm": 0.1183561235666275, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0037, + "step": 22690 + }, + { + "epoch": 1.3893139115000919, + "grad_norm": 0.24478662014007568, + "learning_rate": 4.67129597392514e-06, + "loss": 0.004, + "step": 22700 + }, + { + "epoch": 1.389925944060224, + "grad_norm": 0.28880801796913147, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0039, + "step": 22710 + }, + { + "epoch": 1.3905379766203563, + "grad_norm": 0.14014701545238495, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0034, + "step": 22720 + }, + { + "epoch": 1.3911500091804885, + "grad_norm": 0.1549793928861618, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0033, + "step": 22730 + }, + { + "epoch": 1.3917620417406207, + "grad_norm": 0.1423012614250183, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.3923740743007529, + "grad_norm": 0.291273832321167, + "learning_rate": 4.636728419531758e-06, + "loss": 0.004, + "step": 22750 + }, + { + "epoch": 1.392986106860885, + "grad_norm": 0.38278621435165405, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0045, + "step": 22760 + }, + { + "epoch": 1.3935981394210173, + "grad_norm": 0.20528365671634674, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0042, + "step": 22770 + }, + { + "epoch": 1.3942101719811495, + "grad_norm": 0.11913729459047318, + "learning_rate": 4.616077433849538e-06, + "loss": 0.003, + "step": 22780 + }, + { + "epoch": 1.3948222045412817, + "grad_norm": 0.21683627367019653, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0027, + "step": 22790 + }, + { + "epoch": 1.395434237101414, + "grad_norm": 0.12143554538488388, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0031, + "step": 22800 + }, + { + "epoch": 1.396046269661546, + "grad_norm": 0.14171159267425537, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0039, + "step": 22810 + }, + { + "epoch": 1.3966583022216783, + "grad_norm": 0.19254790246486664, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0043, + "step": 22820 + }, + { + "epoch": 1.3972703347818105, + "grad_norm": 0.12295825034379959, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0045, + "step": 22830 + }, + { + "epoch": 1.3978823673419427, + "grad_norm": 0.1274985820055008, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0037, + "step": 22840 + }, + { + "epoch": 1.398494399902075, + "grad_norm": 0.2940427362918854, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0059, + "step": 22850 + }, + { + "epoch": 1.3991064324622071, + "grad_norm": 0.15357589721679688, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0032, + "step": 22860 + }, + { + "epoch": 1.3997184650223393, + "grad_norm": 0.12781603634357452, + "learning_rate": 4.554529907376127e-06, + "loss": 0.003, + "step": 22870 + }, + { + "epoch": 1.4003304975824715, + "grad_norm": 0.34976109862327576, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0047, + "step": 22880 + }, + { + "epoch": 1.4009425301426035, + "grad_norm": 0.1797824203968048, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0034, + "step": 22890 + }, + { + "epoch": 1.4015545627027357, + "grad_norm": 0.13750647008419037, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0046, + "step": 22900 + }, + { + "epoch": 1.402166595262868, + "grad_norm": 0.22893266379833221, + "learning_rate": 4.527371771040039e-06, + "loss": 0.005, + "step": 22910 + }, + { + "epoch": 1.4027786278230001, + "grad_norm": 0.1595923751592636, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0045, + "step": 22920 + }, + { + "epoch": 1.4033906603831323, + "grad_norm": 0.11474192142486572, + "learning_rate": 4.513838246961138e-06, + "loss": 0.003, + "step": 22930 + }, + { + "epoch": 1.4040026929432645, + "grad_norm": 0.12208060175180435, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.4046147255033967, + "grad_norm": 0.2919016480445862, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0036, + "step": 22950 + }, + { + "epoch": 1.405226758063529, + "grad_norm": 0.19161155819892883, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0049, + "step": 22960 + }, + { + "epoch": 1.4058387906236611, + "grad_norm": 0.1454700380563736, + "learning_rate": 4.486862604628113e-06, + "loss": 0.004, + "step": 22970 + }, + { + "epoch": 1.4064508231837933, + "grad_norm": 0.227305606007576, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0043, + "step": 22980 + }, + { + "epoch": 1.4070628557439255, + "grad_norm": 0.09430288523435593, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0051, + "step": 22990 + }, + { + "epoch": 1.4076748883040577, + "grad_norm": 0.09664178639650345, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.40828692086419, + "grad_norm": 0.21268269419670105, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0031, + "step": 23010 + }, + { + "epoch": 1.4088989534243221, + "grad_norm": 0.09796992689371109, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0041, + "step": 23020 + }, + { + "epoch": 1.4095109859844543, + "grad_norm": 0.18376071751117706, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0039, + "step": 23030 + }, + { + "epoch": 1.4101230185445865, + "grad_norm": 0.10276145488023758, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0035, + "step": 23040 + }, + { + "epoch": 1.4107350511047188, + "grad_norm": 0.16089564561843872, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0051, + "step": 23050 + }, + { + "epoch": 1.411347083664851, + "grad_norm": 0.1825491487979889, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0036, + "step": 23060 + }, + { + "epoch": 1.4119591162249832, + "grad_norm": 0.24405492842197418, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0028, + "step": 23070 + }, + { + "epoch": 1.4125711487851154, + "grad_norm": 0.14085668325424194, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0039, + "step": 23080 + }, + { + "epoch": 1.4131831813452476, + "grad_norm": 0.11708472669124603, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0035, + "step": 23090 + }, + { + "epoch": 1.4137952139053798, + "grad_norm": 0.12108796834945679, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0036, + "step": 23100 + }, + { + "epoch": 1.414407246465512, + "grad_norm": 0.14601854979991913, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0036, + "step": 23110 + }, + { + "epoch": 1.4150192790256442, + "grad_norm": 0.10614772886037827, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0031, + "step": 23120 + }, + { + "epoch": 1.4156313115857764, + "grad_norm": 0.09014416486024857, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0027, + "step": 23130 + }, + { + "epoch": 1.4162433441459086, + "grad_norm": 0.15246634185314178, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0031, + "step": 23140 + }, + { + "epoch": 1.4168553767060408, + "grad_norm": 0.20104879140853882, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0075, + "step": 23150 + }, + { + "epoch": 1.417467409266173, + "grad_norm": 0.1359969973564148, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0036, + "step": 23160 + }, + { + "epoch": 1.4180794418263052, + "grad_norm": 0.19849587976932526, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0054, + "step": 23170 + }, + { + "epoch": 1.4186914743864374, + "grad_norm": 0.12617377936840057, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0024, + "step": 23180 + }, + { + "epoch": 1.4193035069465696, + "grad_norm": 0.15024134516716003, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0034, + "step": 23190 + }, + { + "epoch": 1.4199155395067018, + "grad_norm": 0.2345605194568634, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0036, + "step": 23200 + }, + { + "epoch": 1.420527572066834, + "grad_norm": 0.13125917315483093, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0035, + "step": 23210 + }, + { + "epoch": 1.4211396046269662, + "grad_norm": 0.20977836847305298, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0038, + "step": 23220 + }, + { + "epoch": 1.4217516371870984, + "grad_norm": 0.3925677537918091, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0043, + "step": 23230 + }, + { + "epoch": 1.4223636697472306, + "grad_norm": 0.17691555619239807, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0048, + "step": 23240 + }, + { + "epoch": 1.4229757023073628, + "grad_norm": 0.18366187810897827, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0033, + "step": 23250 + }, + { + "epoch": 1.423587734867495, + "grad_norm": 0.15539205074310303, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0039, + "step": 23260 + }, + { + "epoch": 1.4241997674276272, + "grad_norm": 0.15048520267009735, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0032, + "step": 23270 + }, + { + "epoch": 1.4248117999877594, + "grad_norm": 0.2631739675998688, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0061, + "step": 23280 + }, + { + "epoch": 1.4254238325478916, + "grad_norm": 0.18545641005039215, + "learning_rate": 4.275502195405868e-06, + "loss": 0.005, + "step": 23290 + }, + { + "epoch": 1.4260358651080238, + "grad_norm": 0.25486356019973755, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.426647897668156, + "grad_norm": 0.2514204978942871, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.427259930228288, + "grad_norm": 0.12997376918792725, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0034, + "step": 23320 + }, + { + "epoch": 1.4278719627884202, + "grad_norm": 0.26096200942993164, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0047, + "step": 23330 + }, + { + "epoch": 1.4284839953485524, + "grad_norm": 0.2292930781841278, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0038, + "step": 23340 + }, + { + "epoch": 1.4290960279086846, + "grad_norm": 0.20056717097759247, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0037, + "step": 23350 + }, + { + "epoch": 1.4297080604688168, + "grad_norm": 0.1608581393957138, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0032, + "step": 23360 + }, + { + "epoch": 1.430320093028949, + "grad_norm": 0.235102578997612, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0034, + "step": 23370 + }, + { + "epoch": 1.4309321255890812, + "grad_norm": 0.11869259178638458, + "learning_rate": 4.217502203129258e-06, + "loss": 0.005, + "step": 23380 + }, + { + "epoch": 1.4315441581492134, + "grad_norm": 0.167036771774292, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0045, + "step": 23390 + }, + { + "epoch": 1.4321561907093456, + "grad_norm": 0.13766071200370789, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.4327682232694778, + "grad_norm": 0.15444986522197723, + "learning_rate": 4.198311874248223e-06, + "loss": 0.004, + "step": 23410 + }, + { + "epoch": 1.43338025582961, + "grad_norm": 0.11997724324464798, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0034, + "step": 23420 + }, + { + "epoch": 1.4339922883897422, + "grad_norm": 0.1533307433128357, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0038, + "step": 23430 + }, + { + "epoch": 1.4346043209498744, + "grad_norm": 0.10954161733388901, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0045, + "step": 23440 + }, + { + "epoch": 1.4352163535100066, + "grad_norm": 0.16601058840751648, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4358283860701389, + "grad_norm": 0.1756889373064041, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0035, + "step": 23460 + }, + { + "epoch": 1.436440418630271, + "grad_norm": 0.12633845210075378, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0046, + "step": 23470 + }, + { + "epoch": 1.4370524511904033, + "grad_norm": 0.15678541362285614, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0025, + "step": 23480 + }, + { + "epoch": 1.4376644837505355, + "grad_norm": 0.13923659920692444, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0052, + "step": 23490 + }, + { + "epoch": 1.4382765163106677, + "grad_norm": 0.28792211413383484, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4388885488707999, + "grad_norm": 0.16125047206878662, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0037, + "step": 23510 + }, + { + "epoch": 1.439500581430932, + "grad_norm": 0.2653597593307495, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0067, + "step": 23520 + }, + { + "epoch": 1.4401126139910643, + "grad_norm": 0.2692917585372925, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0031, + "step": 23530 + }, + { + "epoch": 1.4407246465511965, + "grad_norm": 0.2234862893819809, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0044, + "step": 23540 + }, + { + "epoch": 1.4413366791113287, + "grad_norm": 0.17526887357234955, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0025, + "step": 23550 + }, + { + "epoch": 1.4419487116714609, + "grad_norm": 0.10404029488563538, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0041, + "step": 23560 + }, + { + "epoch": 1.442560744231593, + "grad_norm": 0.1385052353143692, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0027, + "step": 23570 + }, + { + "epoch": 1.4431727767917253, + "grad_norm": 0.30865412950515747, + "learning_rate": 4.090929556079854e-06, + "loss": 0.004, + "step": 23580 + }, + { + "epoch": 1.4437848093518575, + "grad_norm": 0.10908320546150208, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0041, + "step": 23590 + }, + { + "epoch": 1.4443968419119897, + "grad_norm": 0.09885916113853455, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0045, + "step": 23600 + }, + { + "epoch": 1.445008874472122, + "grad_norm": 0.1685211956501007, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0031, + "step": 23610 + }, + { + "epoch": 1.445620907032254, + "grad_norm": 0.0967954769730568, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0031, + "step": 23620 + }, + { + "epoch": 1.4462329395923863, + "grad_norm": 0.07489120960235596, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0038, + "step": 23630 + }, + { + "epoch": 1.4468449721525185, + "grad_norm": 0.20616063475608826, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0043, + "step": 23640 + }, + { + "epoch": 1.4474570047126507, + "grad_norm": 0.15788249671459198, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0031, + "step": 23650 + }, + { + "epoch": 1.448069037272783, + "grad_norm": 0.10360633581876755, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0027, + "step": 23660 + }, + { + "epoch": 1.4486810698329151, + "grad_norm": 0.2871163785457611, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0028, + "step": 23670 + }, + { + "epoch": 1.4492931023930473, + "grad_norm": 0.15280364453792572, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0042, + "step": 23680 + }, + { + "epoch": 1.4499051349531795, + "grad_norm": 0.17502477765083313, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0031, + "step": 23690 + }, + { + "epoch": 1.4505171675133117, + "grad_norm": 0.2154005616903305, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0036, + "step": 23700 + }, + { + "epoch": 1.451129200073444, + "grad_norm": 0.15002919733524323, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0033, + "step": 23710 + }, + { + "epoch": 1.4517412326335761, + "grad_norm": 0.10422170162200928, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0032, + "step": 23720 + }, + { + "epoch": 1.4523532651937083, + "grad_norm": 0.15197636187076569, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0043, + "step": 23730 + }, + { + "epoch": 1.4529652977538405, + "grad_norm": 0.2571481466293335, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.4535773303139727, + "grad_norm": 0.12697578966617584, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0034, + "step": 23750 + }, + { + "epoch": 1.454189362874105, + "grad_norm": 0.14347535371780396, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0051, + "step": 23760 + }, + { + "epoch": 1.4548013954342371, + "grad_norm": 0.1494351178407669, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0037, + "step": 23770 + }, + { + "epoch": 1.4554134279943693, + "grad_norm": 0.23901797831058502, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0032, + "step": 23780 + }, + { + "epoch": 1.4560254605545015, + "grad_norm": 0.1434790939092636, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0036, + "step": 23790 + }, + { + "epoch": 1.4566374931146338, + "grad_norm": 0.1456829458475113, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0034, + "step": 23800 + }, + { + "epoch": 1.457249525674766, + "grad_norm": 0.33969590067863464, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0038, + "step": 23810 + }, + { + "epoch": 1.4578615582348982, + "grad_norm": 0.1768753081560135, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0027, + "step": 23820 + }, + { + "epoch": 1.4584735907950304, + "grad_norm": 0.15212708711624146, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0032, + "step": 23830 + }, + { + "epoch": 1.4590856233551626, + "grad_norm": 0.10870973765850067, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0033, + "step": 23840 + }, + { + "epoch": 1.4596976559152948, + "grad_norm": 0.17898528277873993, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0022, + "step": 23850 + }, + { + "epoch": 1.460309688475427, + "grad_norm": 0.15515227615833282, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0034, + "step": 23860 + }, + { + "epoch": 1.4609217210355592, + "grad_norm": 0.11047070473432541, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0032, + "step": 23870 + }, + { + "epoch": 1.4615337535956914, + "grad_norm": 0.08628113567829132, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0032, + "step": 23880 + }, + { + "epoch": 1.4621457861558236, + "grad_norm": 0.358903706073761, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0043, + "step": 23890 + }, + { + "epoch": 1.4627578187159558, + "grad_norm": 0.13986052572727203, + "learning_rate": 3.895183209452123e-06, + "loss": 0.003, + "step": 23900 + }, + { + "epoch": 1.463369851276088, + "grad_norm": 0.09236793220043182, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0029, + "step": 23910 + }, + { + "epoch": 1.4639818838362202, + "grad_norm": 0.14616963267326355, + "learning_rate": 3.883230136754435e-06, + "loss": 0.005, + "step": 23920 + }, + { + "epoch": 1.4645939163963524, + "grad_norm": 0.0754290223121643, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0031, + "step": 23930 + }, + { + "epoch": 1.4652059489564846, + "grad_norm": 0.16520163416862488, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0037, + "step": 23940 + }, + { + "epoch": 1.4658179815166168, + "grad_norm": 0.06801608204841614, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0023, + "step": 23950 + }, + { + "epoch": 1.466430014076749, + "grad_norm": 0.3087909519672394, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0042, + "step": 23960 + }, + { + "epoch": 1.4670420466368812, + "grad_norm": 0.23470532894134521, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0046, + "step": 23970 + }, + { + "epoch": 1.4676540791970134, + "grad_norm": 0.10248749703168869, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0025, + "step": 23980 + }, + { + "epoch": 1.4682661117571456, + "grad_norm": 0.12478570640087128, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0035, + "step": 23990 + }, + { + "epoch": 1.4688781443172778, + "grad_norm": 0.16669252514839172, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0034, + "step": 24000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.499133370171392e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/training_args.bin b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd9e28a44ae85140e2ef027a82e8be4c39167cc4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5644791eb57bcb4c4808b4c2429b71e4c49eece4fc60f263f4553a3380f230bb +size 6097 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/generation_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aae9e860ae372e74936b80c44c96023c6f9e4337 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17d90877dfb234dcca6dc425c0d5fd8923fd973a2c680676596472d722693ce8 +size 4921072616 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9b0015b145f6ee8a84ddfe3109a69412f7160a1 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77ceb6b2ca34eae4bc4e46963649928a63c5c855046459502d8e65c32a4219a +size 4978830984 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02cd384e0b3281ab9e1d802e278128ab3a6f2798 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf3765dc38548b55bf43492bb15588015777a3b0fb9c0ba4ff5bb3dcb7ce02d +size 4100977896 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..f33de4b80f47e0bac1a414431a8354d8345d60c5 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.65332532291412, + -30.64622355117798, + -14.452480476760865, + -1.8581012797355654, + -2.2742317820549007, + -1.9569469915390014, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 3.0011677881240857, + 22.348905650329584, + 21.68580058555603, + 2.3937565994262693, + 4.117288079452516, + 3.295379007720948, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.570000648498535, + -1.0618462562561035, + 3.623035430908203, + 0.010442602448165417, + 0.7240540385246277, + 0.44398337602615356, + 0.12898989021778107, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.04909086227417, + 17.099597930908203, + 8.363018989562988, + 0.6997263431549072, + 1.1358375549316406, + 0.9687971472740173, + 0.9916459321975708, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.777750787353515, + -21.249025872802733, + -2.4021557040214536, + -4.092200187206268, + -3.2986312219619753, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.645499613952634, + 30.59561934127808, + 14.405443457031247, + 1.8499586300849913, + 2.268683268356323, + 1.963451420021057, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.6817545890808105, + 1.3444018363952637, + -3.5411791801452637, + -0.009792014956474304, + -0.7230188846588135, + -0.44849714636802673, + 0.15749873220920563, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.988739013671875, + 16.884004592895508, + 8.242538452148438, + 0.6991510391235352, + 1.1302146911621094, + 0.9690405130386353, + 0.9875192046165466, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4249caf0d5209f1d1022bc62fd28fd6dcee923fe --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json @@ -0,0 +1,18234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5912846563437175, + "eval_steps": 500, + "global_step": 26000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006120325601321991, + "grad_norm": 2.2432243824005127, + "learning_rate": 1.8e-07, + "loss": 0.1384, + "step": 10 + }, + { + "epoch": 0.0012240651202643981, + "grad_norm": 1.959119439125061, + "learning_rate": 3.8e-07, + "loss": 0.1388, + "step": 20 + }, + { + "epoch": 0.001836097680396597, + "grad_norm": 1.8843899965286255, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1307, + "step": 30 + }, + { + "epoch": 0.0024481302405287963, + "grad_norm": 1.7569042444229126, + "learning_rate": 7.8e-07, + "loss": 0.1238, + "step": 40 + }, + { + "epoch": 0.0030601628006609954, + "grad_norm": 2.6189017295837402, + "learning_rate": 9.800000000000001e-07, + "loss": 0.1275, + "step": 50 + }, + { + "epoch": 0.003672195360793194, + "grad_norm": 1.8418694734573364, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.1032, + "step": 60 + }, + { + "epoch": 0.004284227920925393, + "grad_norm": 1.481676697731018, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0816, + "step": 70 + }, + { + "epoch": 0.004896260481057593, + "grad_norm": 0.9590038061141968, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0732, + "step": 80 + }, + { + "epoch": 0.005508293041189791, + "grad_norm": 1.002897024154663, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0608, + "step": 90 + }, + { + "epoch": 0.006120325601321991, + "grad_norm": 0.9830108284950256, + "learning_rate": 1.98e-06, + "loss": 0.042, + "step": 100 + }, + { + "epoch": 0.006732358161454189, + "grad_norm": 0.858244001865387, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0314, + "step": 110 + }, + { + "epoch": 0.007344390721586388, + "grad_norm": 0.5761063694953918, + "learning_rate": 2.38e-06, + "loss": 0.029, + "step": 120 + }, + { + "epoch": 0.007956423281718587, + "grad_norm": 0.5434514284133911, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0227, + "step": 130 + }, + { + "epoch": 0.008568455841850786, + "grad_norm": 0.6488766670227051, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0202, + "step": 140 + }, + { + "epoch": 0.009180488401982986, + "grad_norm": 0.36763015389442444, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0157, + "step": 150 + }, + { + "epoch": 0.009792520962115185, + "grad_norm": 0.49271446466445923, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.010404553522247383, + "grad_norm": 0.23608209192752838, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0122, + "step": 170 + }, + { + "epoch": 0.011016586082379582, + "grad_norm": 0.47871828079223633, + "learning_rate": 3.58e-06, + "loss": 0.0131, + "step": 180 + }, + { + "epoch": 0.011628618642511782, + "grad_norm": 0.6862446069717407, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0131, + "step": 190 + }, + { + "epoch": 0.012240651202643981, + "grad_norm": 0.7964349389076233, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 0.01285268376277618, + "grad_norm": 0.5564846396446228, + "learning_rate": 4.18e-06, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 0.013464716322908379, + "grad_norm": 0.2810452878475189, + "learning_rate": 4.38e-06, + "loss": 0.0128, + "step": 220 + }, + { + "epoch": 0.014076748883040578, + "grad_norm": 0.4474979341030121, + "learning_rate": 4.58e-06, + "loss": 0.0188, + "step": 230 + }, + { + "epoch": 0.014688781443172776, + "grad_norm": 0.47965875267982483, + "learning_rate": 4.78e-06, + "loss": 0.0141, + "step": 240 + }, + { + "epoch": 0.015300814003304975, + "grad_norm": 0.3410812020301819, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0085, + "step": 250 + }, + { + "epoch": 0.015912846563437173, + "grad_norm": 0.39907002449035645, + "learning_rate": 5.18e-06, + "loss": 0.0106, + "step": 260 + }, + { + "epoch": 0.016524879123569373, + "grad_norm": 0.28909367322921753, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0103, + "step": 270 + }, + { + "epoch": 0.017136911683701572, + "grad_norm": 0.31524109840393066, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0101, + "step": 280 + }, + { + "epoch": 0.017748944243833772, + "grad_norm": 0.29430100321769714, + "learning_rate": 5.78e-06, + "loss": 0.0109, + "step": 290 + }, + { + "epoch": 0.01836097680396597, + "grad_norm": 0.2709169387817383, + "learning_rate": 5.98e-06, + "loss": 0.0102, + "step": 300 + }, + { + "epoch": 0.01897300936409817, + "grad_norm": 0.33067119121551514, + "learning_rate": 6.18e-06, + "loss": 0.0095, + "step": 310 + }, + { + "epoch": 0.01958504192423037, + "grad_norm": 0.28110620379447937, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0102, + "step": 320 + }, + { + "epoch": 0.02019707448436257, + "grad_norm": 0.27736902236938477, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0088, + "step": 330 + }, + { + "epoch": 0.020809107044494766, + "grad_norm": 0.3238557279109955, + "learning_rate": 6.780000000000001e-06, + "loss": 0.01, + "step": 340 + }, + { + "epoch": 0.021421139604626965, + "grad_norm": 0.30263441801071167, + "learning_rate": 6.98e-06, + "loss": 0.0095, + "step": 350 + }, + { + "epoch": 0.022033172164759165, + "grad_norm": 0.2618265450000763, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0096, + "step": 360 + }, + { + "epoch": 0.022645204724891364, + "grad_norm": 0.272565633058548, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0093, + "step": 370 + }, + { + "epoch": 0.023257237285023564, + "grad_norm": 0.44272440671920776, + "learning_rate": 7.58e-06, + "loss": 0.0087, + "step": 380 + }, + { + "epoch": 0.023869269845155763, + "grad_norm": 0.27631404995918274, + "learning_rate": 7.78e-06, + "loss": 0.0093, + "step": 390 + }, + { + "epoch": 0.024481302405287963, + "grad_norm": 0.4108494520187378, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0093, + "step": 400 + }, + { + "epoch": 0.02509333496542016, + "grad_norm": 0.43498387932777405, + "learning_rate": 8.18e-06, + "loss": 0.0098, + "step": 410 + }, + { + "epoch": 0.02570536752555236, + "grad_norm": 0.3419845700263977, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0091, + "step": 420 + }, + { + "epoch": 0.026317400085684558, + "grad_norm": 0.5677013993263245, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0104, + "step": 430 + }, + { + "epoch": 0.026929432645816757, + "grad_norm": 0.24424298107624054, + "learning_rate": 8.78e-06, + "loss": 0.0089, + "step": 440 + }, + { + "epoch": 0.027541465205948957, + "grad_norm": 0.267781138420105, + "learning_rate": 8.98e-06, + "loss": 0.0107, + "step": 450 + }, + { + "epoch": 0.028153497766081156, + "grad_norm": 0.38459253311157227, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0081, + "step": 460 + }, + { + "epoch": 0.028765530326213356, + "grad_norm": 0.2647954523563385, + "learning_rate": 9.38e-06, + "loss": 0.0082, + "step": 470 + }, + { + "epoch": 0.029377562886345552, + "grad_norm": 0.44312018156051636, + "learning_rate": 9.58e-06, + "loss": 0.0102, + "step": 480 + }, + { + "epoch": 0.02998959544647775, + "grad_norm": 0.2309781014919281, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0118, + "step": 490 + }, + { + "epoch": 0.03060162800660995, + "grad_norm": 0.41755014657974243, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0094, + "step": 500 + }, + { + "epoch": 0.03121366056674215, + "grad_norm": 0.38537120819091797, + "learning_rate": 1.018e-05, + "loss": 0.011, + "step": 510 + }, + { + "epoch": 0.031825693126874346, + "grad_norm": 0.49801477789878845, + "learning_rate": 1.038e-05, + "loss": 0.0093, + "step": 520 + }, + { + "epoch": 0.03243772568700655, + "grad_norm": 0.3854966163635254, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0116, + "step": 530 + }, + { + "epoch": 0.033049758247138745, + "grad_norm": 0.3163810968399048, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.008, + "step": 540 + }, + { + "epoch": 0.03366179080727095, + "grad_norm": 0.33000636100769043, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0093, + "step": 550 + }, + { + "epoch": 0.034273823367403145, + "grad_norm": 0.3350297808647156, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0083, + "step": 560 + }, + { + "epoch": 0.03488585592753535, + "grad_norm": 0.18780949711799622, + "learning_rate": 1.138e-05, + "loss": 0.0097, + "step": 570 + }, + { + "epoch": 0.035497888487667544, + "grad_norm": 0.20399607717990875, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0092, + "step": 580 + }, + { + "epoch": 0.03610992104779974, + "grad_norm": 0.15931005775928497, + "learning_rate": 1.178e-05, + "loss": 0.0076, + "step": 590 + }, + { + "epoch": 0.03672195360793194, + "grad_norm": 0.20751547813415527, + "learning_rate": 1.198e-05, + "loss": 0.0079, + "step": 600 + }, + { + "epoch": 0.03733398616806414, + "grad_norm": 0.39666953682899475, + "learning_rate": 1.218e-05, + "loss": 0.0072, + "step": 610 + }, + { + "epoch": 0.03794601872819634, + "grad_norm": 0.385407030582428, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0089, + "step": 620 + }, + { + "epoch": 0.03855805128832854, + "grad_norm": 0.5228332877159119, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0092, + "step": 630 + }, + { + "epoch": 0.03917008384846074, + "grad_norm": 0.29315415024757385, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0098, + "step": 640 + }, + { + "epoch": 0.03978211640859294, + "grad_norm": 0.4300646483898163, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0079, + "step": 650 + }, + { + "epoch": 0.04039414896872514, + "grad_norm": 0.38021156191825867, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0103, + "step": 660 + }, + { + "epoch": 0.041006181528857336, + "grad_norm": 0.43489688634872437, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0105, + "step": 670 + }, + { + "epoch": 0.04161821408898953, + "grad_norm": 0.48019328713417053, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0124, + "step": 680 + }, + { + "epoch": 0.042230246649121735, + "grad_norm": 0.28486984968185425, + "learning_rate": 1.378e-05, + "loss": 0.0122, + "step": 690 + }, + { + "epoch": 0.04284227920925393, + "grad_norm": 0.35172080993652344, + "learning_rate": 1.398e-05, + "loss": 0.0093, + "step": 700 + }, + { + "epoch": 0.043454311769386134, + "grad_norm": 0.32531124353408813, + "learning_rate": 1.418e-05, + "loss": 0.0116, + "step": 710 + }, + { + "epoch": 0.04406634432951833, + "grad_norm": 0.388637512922287, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0077, + "step": 720 + }, + { + "epoch": 0.04467837688965053, + "grad_norm": 0.3816429078578949, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0123, + "step": 730 + }, + { + "epoch": 0.04529040944978273, + "grad_norm": 0.22786036133766174, + "learning_rate": 1.478e-05, + "loss": 0.0089, + "step": 740 + }, + { + "epoch": 0.045902442009914925, + "grad_norm": 0.2965328097343445, + "learning_rate": 1.498e-05, + "loss": 0.011, + "step": 750 + }, + { + "epoch": 0.04651447457004713, + "grad_norm": 0.3568362593650818, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0101, + "step": 760 + }, + { + "epoch": 0.047126507130179324, + "grad_norm": 0.2972166836261749, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0093, + "step": 770 + }, + { + "epoch": 0.04773853969031153, + "grad_norm": 0.4221388101577759, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.012, + "step": 780 + }, + { + "epoch": 0.04835057225044372, + "grad_norm": 0.37255391478538513, + "learning_rate": 1.578e-05, + "loss": 0.0085, + "step": 790 + }, + { + "epoch": 0.048962604810575926, + "grad_norm": 0.36007094383239746, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.008, + "step": 800 + }, + { + "epoch": 0.04957463737070812, + "grad_norm": 0.40588808059692383, + "learning_rate": 1.618e-05, + "loss": 0.0081, + "step": 810 + }, + { + "epoch": 0.05018666993084032, + "grad_norm": 0.46563687920570374, + "learning_rate": 1.638e-05, + "loss": 0.0076, + "step": 820 + }, + { + "epoch": 0.05079870249097252, + "grad_norm": 0.3161381483078003, + "learning_rate": 1.658e-05, + "loss": 0.0129, + "step": 830 + }, + { + "epoch": 0.05141073505110472, + "grad_norm": 0.3800298869609833, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0146, + "step": 840 + }, + { + "epoch": 0.05202276761123692, + "grad_norm": 0.36572107672691345, + "learning_rate": 1.698e-05, + "loss": 0.0148, + "step": 850 + }, + { + "epoch": 0.052634800171369116, + "grad_norm": 0.4084141254425049, + "learning_rate": 1.718e-05, + "loss": 0.0085, + "step": 860 + }, + { + "epoch": 0.05324683273150132, + "grad_norm": 0.2906867265701294, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0116, + "step": 870 + }, + { + "epoch": 0.053858865291633515, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.758e-05, + "loss": 0.0076, + "step": 880 + }, + { + "epoch": 0.05447089785176571, + "grad_norm": 0.5292996764183044, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0094, + "step": 890 + }, + { + "epoch": 0.055082930411897914, + "grad_norm": 0.23192685842514038, + "learning_rate": 1.798e-05, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 0.05569496297203011, + "grad_norm": 0.41050270199775696, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0099, + "step": 910 + }, + { + "epoch": 0.05630699553216231, + "grad_norm": 0.3336002230644226, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.05691902809229451, + "grad_norm": 0.46233776211738586, + "learning_rate": 1.858e-05, + "loss": 0.0104, + "step": 930 + }, + { + "epoch": 0.05753106065242671, + "grad_norm": 0.36776405572891235, + "learning_rate": 1.878e-05, + "loss": 0.0115, + "step": 940 + }, + { + "epoch": 0.05814309321255891, + "grad_norm": 0.47848618030548096, + "learning_rate": 1.898e-05, + "loss": 0.0108, + "step": 950 + }, + { + "epoch": 0.058755125772691104, + "grad_norm": 0.35507604479789734, + "learning_rate": 1.918e-05, + "loss": 0.0095, + "step": 960 + }, + { + "epoch": 0.05936715833282331, + "grad_norm": 0.4613397717475891, + "learning_rate": 1.938e-05, + "loss": 0.0119, + "step": 970 + }, + { + "epoch": 0.0599791908929555, + "grad_norm": 0.34492260217666626, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0109, + "step": 980 + }, + { + "epoch": 0.060591223453087706, + "grad_norm": 0.34624582529067993, + "learning_rate": 1.978e-05, + "loss": 0.0099, + "step": 990 + }, + { + "epoch": 0.0612032560132199, + "grad_norm": 0.9161475896835327, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0109, + "step": 1000 + }, + { + "epoch": 0.061815288573352105, + "grad_norm": 0.367807537317276, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0087, + "step": 1010 + }, + { + "epoch": 0.0624273211334843, + "grad_norm": 0.4043216407299042, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.0084, + "step": 1020 + }, + { + "epoch": 0.0630393536936165, + "grad_norm": 0.315305233001709, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0075, + "step": 1030 + }, + { + "epoch": 0.06365138625374869, + "grad_norm": 0.49702969193458557, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0103, + "step": 1040 + }, + { + "epoch": 0.0642634188138809, + "grad_norm": 0.46286216378211975, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0116, + "step": 1050 + }, + { + "epoch": 0.0648754513740131, + "grad_norm": 0.332142174243927, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0103, + "step": 1060 + }, + { + "epoch": 0.0654874839341453, + "grad_norm": 0.6118510961532593, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0118, + "step": 1070 + }, + { + "epoch": 0.06609951649427749, + "grad_norm": 0.49074795842170715, + "learning_rate": 1.999967041472886e-05, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.0667115490544097, + "grad_norm": 0.42575374245643616, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0125, + "step": 1090 + }, + { + "epoch": 0.0673235816145419, + "grad_norm": 0.3223794996738434, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0091, + "step": 1100 + }, + { + "epoch": 0.06793561417467409, + "grad_norm": 0.4952760636806488, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.011, + "step": 1110 + }, + { + "epoch": 0.06854764673480629, + "grad_norm": 0.36144813895225525, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0096, + "step": 1120 + }, + { + "epoch": 0.06915967929493849, + "grad_norm": 0.31190025806427, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0115, + "step": 1130 + }, + { + "epoch": 0.0697717118550707, + "grad_norm": 0.7014928460121155, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.07038374441520288, + "grad_norm": 0.4382205605506897, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0111, + "step": 1150 + }, + { + "epoch": 0.07099577697533509, + "grad_norm": 0.3750714659690857, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0116, + "step": 1160 + }, + { + "epoch": 0.07160780953546729, + "grad_norm": 0.4174371361732483, + "learning_rate": 1.999849173538598e-05, + "loss": 0.009, + "step": 1170 + }, + { + "epoch": 0.07221984209559948, + "grad_norm": 0.44394591450691223, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.07283187465573168, + "grad_norm": 0.43412888050079346, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0098, + "step": 1190 + }, + { + "epoch": 0.07344390721586389, + "grad_norm": 0.6421196460723877, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.01, + "step": 1200 + }, + { + "epoch": 0.07405593977599609, + "grad_norm": 0.6313903331756592, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0137, + "step": 1210 + }, + { + "epoch": 0.07466797233612828, + "grad_norm": 0.49340254068374634, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0104, + "step": 1220 + }, + { + "epoch": 0.07528000489626048, + "grad_norm": 0.40420663356781006, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0126, + "step": 1230 + }, + { + "epoch": 0.07589203745639268, + "grad_norm": 0.3955318033695221, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.013, + "step": 1240 + }, + { + "epoch": 0.07650407001652489, + "grad_norm": 0.4967520236968994, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0098, + "step": 1250 + }, + { + "epoch": 0.07711610257665708, + "grad_norm": 0.3380029499530792, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0084, + "step": 1260 + }, + { + "epoch": 0.07772813513678928, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.009, + "step": 1270 + }, + { + "epoch": 0.07834016769692148, + "grad_norm": 0.4533286392688751, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0093, + "step": 1280 + }, + { + "epoch": 0.07895220025705367, + "grad_norm": 0.39559242129325867, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0106, + "step": 1290 + }, + { + "epoch": 0.07956423281718587, + "grad_norm": 0.23190362751483917, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.01, + "step": 1300 + }, + { + "epoch": 0.08017626537731808, + "grad_norm": 0.4732286334037781, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.08078829793745028, + "grad_norm": 0.3010174036026001, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.08140033049758247, + "grad_norm": 0.3989834189414978, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0097, + "step": 1330 + }, + { + "epoch": 0.08201236305771467, + "grad_norm": 0.4597114622592926, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.01, + "step": 1340 + }, + { + "epoch": 0.08262439561784687, + "grad_norm": 0.426826536655426, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.011, + "step": 1350 + }, + { + "epoch": 0.08323642817797906, + "grad_norm": 0.4876341223716736, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0105, + "step": 1360 + }, + { + "epoch": 0.08384846073811127, + "grad_norm": 0.5444457530975342, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0099, + "step": 1370 + }, + { + "epoch": 0.08446049329824347, + "grad_norm": 0.5096126794815063, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.007, + "step": 1380 + }, + { + "epoch": 0.08507252585837567, + "grad_norm": 0.43828368186950684, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.08568455841850786, + "grad_norm": 0.40163955092430115, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0103, + "step": 1400 + }, + { + "epoch": 0.08629659097864006, + "grad_norm": 0.3110432028770447, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0115, + "step": 1410 + }, + { + "epoch": 0.08690862353877227, + "grad_norm": 0.8393893241882324, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.012, + "step": 1420 + }, + { + "epoch": 0.08752065609890446, + "grad_norm": 0.2751714289188385, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0093, + "step": 1430 + }, + { + "epoch": 0.08813268865903666, + "grad_norm": 0.36969971656799316, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0112, + "step": 1440 + }, + { + "epoch": 0.08874472121916886, + "grad_norm": 0.3721938729286194, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08935675377930107, + "grad_norm": 0.26564934849739075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0107, + "step": 1460 + }, + { + "epoch": 0.08996878633943325, + "grad_norm": 0.36552169919013977, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0123, + "step": 1470 + }, + { + "epoch": 0.09058081889956546, + "grad_norm": 0.23664990067481995, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0074, + "step": 1480 + }, + { + "epoch": 0.09119285145969766, + "grad_norm": 0.49903133511543274, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.09180488401982985, + "grad_norm": 0.43505051732063293, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 0.09241691657996205, + "grad_norm": 0.20318932831287384, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0088, + "step": 1510 + }, + { + "epoch": 0.09302894914009426, + "grad_norm": 0.3289708197116852, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.008, + "step": 1520 + }, + { + "epoch": 0.09364098170022646, + "grad_norm": 0.3920934200286865, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0145, + "step": 1530 + }, + { + "epoch": 0.09425301426035865, + "grad_norm": 0.40396374464035034, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.09486504682049085, + "grad_norm": 0.4044182300567627, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.012, + "step": 1550 + }, + { + "epoch": 0.09547707938062305, + "grad_norm": 0.2318611741065979, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0115, + "step": 1560 + }, + { + "epoch": 0.09608911194075524, + "grad_norm": 0.3905714750289917, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.008, + "step": 1570 + }, + { + "epoch": 0.09670114450088745, + "grad_norm": 0.2516922652721405, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0084, + "step": 1580 + }, + { + "epoch": 0.09731317706101965, + "grad_norm": 0.338455468416214, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0122, + "step": 1590 + }, + { + "epoch": 0.09792520962115185, + "grad_norm": 0.31875041127204895, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0081, + "step": 1600 + }, + { + "epoch": 0.09853724218128404, + "grad_norm": 0.2996121644973755, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0068, + "step": 1610 + }, + { + "epoch": 0.09914927474141624, + "grad_norm": 0.4381162226200104, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0103, + "step": 1620 + }, + { + "epoch": 0.09976130730154845, + "grad_norm": 0.5531038045883179, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0168, + "step": 1630 + }, + { + "epoch": 0.10037333986168064, + "grad_norm": 1.1283385753631592, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0119, + "step": 1640 + }, + { + "epoch": 0.10098537242181284, + "grad_norm": 0.38017332553863525, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0096, + "step": 1650 + }, + { + "epoch": 0.10159740498194504, + "grad_norm": 0.4669477045536041, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0111, + "step": 1660 + }, + { + "epoch": 0.10220943754207724, + "grad_norm": 0.3903254270553589, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0098, + "step": 1670 + }, + { + "epoch": 0.10282147010220943, + "grad_norm": 0.49671587347984314, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0083, + "step": 1680 + }, + { + "epoch": 0.10343350266234164, + "grad_norm": 0.36555853486061096, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0088, + "step": 1690 + }, + { + "epoch": 0.10404553522247384, + "grad_norm": 0.21804726123809814, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0086, + "step": 1700 + }, + { + "epoch": 0.10465756778260603, + "grad_norm": 0.6744784116744995, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0109, + "step": 1710 + }, + { + "epoch": 0.10526960034273823, + "grad_norm": 0.34379470348358154, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0096, + "step": 1720 + }, + { + "epoch": 0.10588163290287043, + "grad_norm": 0.27760598063468933, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 0.10649366546300264, + "grad_norm": 0.36294442415237427, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10710569802313483, + "grad_norm": 0.42200908064842224, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.011, + "step": 1750 + }, + { + "epoch": 0.10771773058326703, + "grad_norm": 0.47863906621932983, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0107, + "step": 1760 + }, + { + "epoch": 0.10832976314339923, + "grad_norm": 0.32717248797416687, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0112, + "step": 1770 + }, + { + "epoch": 0.10894179570353142, + "grad_norm": 0.4255545735359192, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0106, + "step": 1780 + }, + { + "epoch": 0.10955382826366362, + "grad_norm": 0.5034983158111572, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0098, + "step": 1790 + }, + { + "epoch": 0.11016586082379583, + "grad_norm": 0.37071412801742554, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0099, + "step": 1800 + }, + { + "epoch": 0.11077789338392803, + "grad_norm": 0.23624737560749054, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0069, + "step": 1810 + }, + { + "epoch": 0.11138992594406022, + "grad_norm": 0.5815485715866089, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0095, + "step": 1820 + }, + { + "epoch": 0.11200195850419242, + "grad_norm": 1.1828722953796387, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0104, + "step": 1830 + }, + { + "epoch": 0.11261399106432463, + "grad_norm": 0.38099589943885803, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0106, + "step": 1840 + }, + { + "epoch": 0.11322602362445681, + "grad_norm": 0.38476184010505676, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0112, + "step": 1850 + }, + { + "epoch": 0.11383805618458902, + "grad_norm": 0.48982104659080505, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0125, + "step": 1860 + }, + { + "epoch": 0.11445008874472122, + "grad_norm": 0.4165821671485901, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0099, + "step": 1870 + }, + { + "epoch": 0.11506212130485342, + "grad_norm": 0.3412662446498871, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0061, + "step": 1880 + }, + { + "epoch": 0.11567415386498561, + "grad_norm": 0.46617937088012695, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0129, + "step": 1890 + }, + { + "epoch": 0.11628618642511782, + "grad_norm": 0.2705824077129364, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0082, + "step": 1900 + }, + { + "epoch": 0.11689821898525002, + "grad_norm": 0.3567829430103302, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0125, + "step": 1910 + }, + { + "epoch": 0.11751025154538221, + "grad_norm": 0.4438138008117676, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0129, + "step": 1920 + }, + { + "epoch": 0.11812228410551441, + "grad_norm": 0.356703519821167, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0083, + "step": 1930 + }, + { + "epoch": 0.11873431666564661, + "grad_norm": 0.6039804220199585, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0086, + "step": 1940 + }, + { + "epoch": 0.11934634922577882, + "grad_norm": 0.4572801887989044, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0067, + "step": 1950 + }, + { + "epoch": 0.119958381785911, + "grad_norm": 0.5063445568084717, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0091, + "step": 1960 + }, + { + "epoch": 0.12057041434604321, + "grad_norm": 0.3467857837677002, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.008, + "step": 1970 + }, + { + "epoch": 0.12118244690617541, + "grad_norm": 0.4875742197036743, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0102, + "step": 1980 + }, + { + "epoch": 0.1217944794663076, + "grad_norm": 0.3209119141101837, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0081, + "step": 1990 + }, + { + "epoch": 0.1224065120264398, + "grad_norm": 0.4731980860233307, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0118, + "step": 2000 + }, + { + "epoch": 0.123018544586572, + "grad_norm": 0.5742963552474976, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0125, + "step": 2010 + }, + { + "epoch": 0.12363057714670421, + "grad_norm": 0.41357406973838806, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.1242426097068364, + "grad_norm": 0.6277521252632141, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0096, + "step": 2030 + }, + { + "epoch": 0.1248546422669686, + "grad_norm": 0.41252902150154114, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0108, + "step": 2040 + }, + { + "epoch": 0.1254666748271008, + "grad_norm": 0.782122790813446, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0134, + "step": 2050 + }, + { + "epoch": 0.126078707387233, + "grad_norm": 0.45011264085769653, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0102, + "step": 2060 + }, + { + "epoch": 0.1266907399473652, + "grad_norm": 0.2724951207637787, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0088, + "step": 2070 + }, + { + "epoch": 0.12730277250749739, + "grad_norm": 0.2351481169462204, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.009, + "step": 2080 + }, + { + "epoch": 0.1279148050676296, + "grad_norm": 0.34568479657173157, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0092, + "step": 2090 + }, + { + "epoch": 0.1285268376277618, + "grad_norm": 0.44493499398231506, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0087, + "step": 2100 + }, + { + "epoch": 0.129138870187894, + "grad_norm": 0.3011283874511719, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0108, + "step": 2110 + }, + { + "epoch": 0.1297509027480262, + "grad_norm": 0.4170232117176056, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.1303629353081584, + "grad_norm": 0.2696056365966797, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0093, + "step": 2130 + }, + { + "epoch": 0.1309749678682906, + "grad_norm": 0.4092336893081665, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0083, + "step": 2140 + }, + { + "epoch": 0.13158700042842278, + "grad_norm": 0.36637401580810547, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.01, + "step": 2150 + }, + { + "epoch": 0.13219903298855498, + "grad_norm": 0.28675684332847595, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0079, + "step": 2160 + }, + { + "epoch": 0.13281106554868718, + "grad_norm": 0.27699902653694153, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0071, + "step": 2170 + }, + { + "epoch": 0.1334230981088194, + "grad_norm": 0.3832298517227173, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0104, + "step": 2180 + }, + { + "epoch": 0.1340351306689516, + "grad_norm": 0.3590598702430725, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 0.1346471632290838, + "grad_norm": 0.21830014884471893, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0093, + "step": 2200 + }, + { + "epoch": 0.135259195789216, + "grad_norm": 0.342492938041687, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0109, + "step": 2210 + }, + { + "epoch": 0.13587122834934817, + "grad_norm": 0.6337023973464966, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0082, + "step": 2220 + }, + { + "epoch": 0.13648326090948038, + "grad_norm": 0.41742798686027527, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13709529346961258, + "grad_norm": 0.3180190324783325, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 0.13770732602974478, + "grad_norm": 0.36720144748687744, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0114, + "step": 2250 + }, + { + "epoch": 0.13831935858987698, + "grad_norm": 0.29457366466522217, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0077, + "step": 2260 + }, + { + "epoch": 0.1389313911500092, + "grad_norm": 0.24702222645282745, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0074, + "step": 2270 + }, + { + "epoch": 0.1395434237101414, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0079, + "step": 2280 + }, + { + "epoch": 0.14015545627027357, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0127, + "step": 2290 + }, + { + "epoch": 0.14076748883040577, + "grad_norm": 0.44338247179985046, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0077, + "step": 2300 + }, + { + "epoch": 0.14137952139053797, + "grad_norm": 0.31765618920326233, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0091, + "step": 2310 + }, + { + "epoch": 0.14199155395067017, + "grad_norm": 0.322534441947937, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0068, + "step": 2320 + }, + { + "epoch": 0.14260358651080238, + "grad_norm": 0.23571068048477173, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0094, + "step": 2330 + }, + { + "epoch": 0.14321561907093458, + "grad_norm": 0.26818808913230896, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0092, + "step": 2340 + }, + { + "epoch": 0.14382765163106678, + "grad_norm": 0.31886982917785645, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0084, + "step": 2350 + }, + { + "epoch": 0.14443968419119896, + "grad_norm": 0.5176070928573608, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0104, + "step": 2360 + }, + { + "epoch": 0.14505171675133116, + "grad_norm": 0.4322161078453064, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0081, + "step": 2370 + }, + { + "epoch": 0.14566374931146336, + "grad_norm": 0.4076510965824127, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0102, + "step": 2380 + }, + { + "epoch": 0.14627578187159557, + "grad_norm": 0.3808838725090027, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0096, + "step": 2390 + }, + { + "epoch": 0.14688781443172777, + "grad_norm": 0.5045232176780701, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0109, + "step": 2400 + }, + { + "epoch": 0.14749984699185997, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0077, + "step": 2410 + }, + { + "epoch": 0.14811187955199218, + "grad_norm": 0.28561875224113464, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0141, + "step": 2420 + }, + { + "epoch": 0.14872391211212435, + "grad_norm": 0.414410799741745, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0094, + "step": 2430 + }, + { + "epoch": 0.14933594467225655, + "grad_norm": 0.4587285816669464, + "learning_rate": 1.989086647373215e-05, + "loss": 0.009, + "step": 2440 + }, + { + "epoch": 0.14994797723238876, + "grad_norm": 0.7567377686500549, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0084, + "step": 2450 + }, + { + "epoch": 0.15056000979252096, + "grad_norm": 0.4980221390724182, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0087, + "step": 2460 + }, + { + "epoch": 0.15117204235265316, + "grad_norm": 0.41810303926467896, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0082, + "step": 2470 + }, + { + "epoch": 0.15178407491278537, + "grad_norm": 0.4193445146083832, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 0.15239610747291757, + "grad_norm": 0.2561246156692505, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0074, + "step": 2490 + }, + { + "epoch": 0.15300814003304977, + "grad_norm": 0.22316500544548035, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0069, + "step": 2500 + }, + { + "epoch": 0.15362017259318195, + "grad_norm": 0.31504112482070923, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0097, + "step": 2510 + }, + { + "epoch": 0.15423220515331415, + "grad_norm": 0.2944568991661072, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0101, + "step": 2520 + }, + { + "epoch": 0.15484423771344635, + "grad_norm": 0.2744649052619934, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0074, + "step": 2530 + }, + { + "epoch": 0.15545627027357856, + "grad_norm": 0.2717166841030121, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.007, + "step": 2540 + }, + { + "epoch": 0.15606830283371076, + "grad_norm": 0.32652929425239563, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.15668033539384296, + "grad_norm": 0.3169964849948883, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0089, + "step": 2560 + }, + { + "epoch": 0.15729236795397517, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0083, + "step": 2570 + }, + { + "epoch": 0.15790440051410734, + "grad_norm": 0.3869011700153351, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0088, + "step": 2580 + }, + { + "epoch": 0.15851643307423954, + "grad_norm": 0.2944110333919525, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0085, + "step": 2590 + }, + { + "epoch": 0.15912846563437175, + "grad_norm": 0.27993839979171753, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0097, + "step": 2600 + }, + { + "epoch": 0.15974049819450395, + "grad_norm": 0.42018845677375793, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0104, + "step": 2610 + }, + { + "epoch": 0.16035253075463615, + "grad_norm": 0.45006832480430603, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0072, + "step": 2620 + }, + { + "epoch": 0.16096456331476836, + "grad_norm": 0.275564581155777, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0072, + "step": 2630 + }, + { + "epoch": 0.16157659587490056, + "grad_norm": 0.503052294254303, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0091, + "step": 2640 + }, + { + "epoch": 0.16218862843503273, + "grad_norm": 0.33740976452827454, + "learning_rate": 1.985678043265668e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.16280066099516494, + "grad_norm": 0.5379078984260559, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0097, + "step": 2660 + }, + { + "epoch": 0.16341269355529714, + "grad_norm": 0.3605813980102539, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0109, + "step": 2670 + }, + { + "epoch": 0.16402472611542934, + "grad_norm": 0.49490585923194885, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 0.16463675867556155, + "grad_norm": 0.29894375801086426, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0089, + "step": 2690 + }, + { + "epoch": 0.16524879123569375, + "grad_norm": 0.395270437002182, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 0.16586082379582595, + "grad_norm": 0.25507843494415283, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0078, + "step": 2710 + }, + { + "epoch": 0.16647285635595813, + "grad_norm": 0.3304852843284607, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0178, + "step": 2720 + }, + { + "epoch": 0.16708488891609033, + "grad_norm": 0.4356633126735687, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0082, + "step": 2730 + }, + { + "epoch": 0.16769692147622253, + "grad_norm": 0.4104527533054352, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 0.16830895403635474, + "grad_norm": 0.25723493099212646, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0097, + "step": 2750 + }, + { + "epoch": 0.16892098659648694, + "grad_norm": 0.3280608057975769, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0079, + "step": 2760 + }, + { + "epoch": 0.16953301915661914, + "grad_norm": 0.4641128480434418, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0081, + "step": 2770 + }, + { + "epoch": 0.17014505171675134, + "grad_norm": 0.2704941928386688, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0112, + "step": 2780 + }, + { + "epoch": 0.17075708427688352, + "grad_norm": 0.42343780398368835, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0084, + "step": 2790 + }, + { + "epoch": 0.17136911683701572, + "grad_norm": 0.2606532573699951, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0085, + "step": 2800 + }, + { + "epoch": 0.17198114939714793, + "grad_norm": 0.39099374413490295, + "learning_rate": 1.982773261916081e-05, + "loss": 0.014, + "step": 2810 + }, + { + "epoch": 0.17259318195728013, + "grad_norm": 0.32653889060020447, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0095, + "step": 2820 + }, + { + "epoch": 0.17320521451741233, + "grad_norm": 0.34765321016311646, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17381724707754453, + "grad_norm": 0.2844177186489105, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.011, + "step": 2840 + }, + { + "epoch": 0.17442927963767674, + "grad_norm": 0.5079899430274963, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0114, + "step": 2850 + }, + { + "epoch": 0.1750413121978089, + "grad_norm": 0.4043678045272827, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0073, + "step": 2860 + }, + { + "epoch": 0.17565334475794112, + "grad_norm": 0.3833003640174866, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0076, + "step": 2870 + }, + { + "epoch": 0.17626537731807332, + "grad_norm": 0.2826341986656189, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0072, + "step": 2880 + }, + { + "epoch": 0.17687740987820552, + "grad_norm": 0.6043460965156555, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0084, + "step": 2890 + }, + { + "epoch": 0.17748944243833772, + "grad_norm": 0.3238481879234314, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.17810147499846993, + "grad_norm": 0.45817995071411133, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0103, + "step": 2910 + }, + { + "epoch": 0.17871350755860213, + "grad_norm": 0.21048744022846222, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0094, + "step": 2920 + }, + { + "epoch": 0.1793255401187343, + "grad_norm": 0.3401891887187958, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0058, + "step": 2930 + }, + { + "epoch": 0.1799375726788665, + "grad_norm": 0.3655509948730469, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0071, + "step": 2940 + }, + { + "epoch": 0.1805496052389987, + "grad_norm": 0.47406241297721863, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.18116163779913091, + "grad_norm": 0.3278841972351074, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0121, + "step": 2960 + }, + { + "epoch": 0.18177367035926312, + "grad_norm": 0.271436482667923, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.008, + "step": 2970 + }, + { + "epoch": 0.18238570291939532, + "grad_norm": 0.41475561261177063, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.008, + "step": 2980 + }, + { + "epoch": 0.18299773547952752, + "grad_norm": 0.5389090776443481, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0091, + "step": 2990 + }, + { + "epoch": 0.1836097680396597, + "grad_norm": 0.3958609700202942, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0106, + "step": 3000 + }, + { + "epoch": 0.1842218005997919, + "grad_norm": 0.3456019461154938, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0084, + "step": 3010 + }, + { + "epoch": 0.1848338331599241, + "grad_norm": 0.2959386706352234, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0071, + "step": 3020 + }, + { + "epoch": 0.1854458657200563, + "grad_norm": 0.2617223858833313, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0082, + "step": 3030 + }, + { + "epoch": 0.1860578982801885, + "grad_norm": 0.45173966884613037, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0105, + "step": 3040 + }, + { + "epoch": 0.1866699308403207, + "grad_norm": 0.4127421975135803, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.008, + "step": 3050 + }, + { + "epoch": 0.18728196340045292, + "grad_norm": 0.3142230808734894, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0085, + "step": 3060 + }, + { + "epoch": 0.1878939959605851, + "grad_norm": 0.49720287322998047, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0089, + "step": 3070 + }, + { + "epoch": 0.1885060285207173, + "grad_norm": 0.6417365074157715, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0088, + "step": 3080 + }, + { + "epoch": 0.1891180610808495, + "grad_norm": 0.44801583886146545, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0098, + "step": 3090 + }, + { + "epoch": 0.1897300936409817, + "grad_norm": 0.3606127202510834, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0087, + "step": 3100 + }, + { + "epoch": 0.1903421262011139, + "grad_norm": 0.268971711397171, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0092, + "step": 3110 + }, + { + "epoch": 0.1909541587612461, + "grad_norm": 0.2367011308670044, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0074, + "step": 3120 + }, + { + "epoch": 0.1915661913213783, + "grad_norm": 0.41643625497817993, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0062, + "step": 3130 + }, + { + "epoch": 0.19217822388151048, + "grad_norm": 0.33202284574508667, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0081, + "step": 3140 + }, + { + "epoch": 0.1927902564416427, + "grad_norm": 0.279813289642334, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0074, + "step": 3150 + }, + { + "epoch": 0.1934022890017749, + "grad_norm": 0.5127174258232117, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0101, + "step": 3160 + }, + { + "epoch": 0.1940143215619071, + "grad_norm": 0.36921849846839905, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.1946263541220393, + "grad_norm": 0.3509728014469147, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0114, + "step": 3180 + }, + { + "epoch": 0.1952383866821715, + "grad_norm": 0.3088139295578003, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0089, + "step": 3190 + }, + { + "epoch": 0.1958504192423037, + "grad_norm": 0.43653762340545654, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0116, + "step": 3200 + }, + { + "epoch": 0.19646245180243588, + "grad_norm": 0.2522308826446533, + "learning_rate": 1.974353140804231e-05, + "loss": 0.007, + "step": 3210 + }, + { + "epoch": 0.19707448436256808, + "grad_norm": 0.37519100308418274, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 0.19768651692270028, + "grad_norm": 0.379027783870697, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0086, + "step": 3230 + }, + { + "epoch": 0.1982985494828325, + "grad_norm": 0.2713090479373932, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0073, + "step": 3240 + }, + { + "epoch": 0.1989105820429647, + "grad_norm": 0.41106846928596497, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0188, + "step": 3250 + }, + { + "epoch": 0.1995226146030969, + "grad_norm": 0.3914758861064911, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 0.2001346471632291, + "grad_norm": 0.4763018488883972, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0082, + "step": 3270 + }, + { + "epoch": 0.20074667972336127, + "grad_norm": 0.23002664744853973, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0085, + "step": 3280 + }, + { + "epoch": 0.20135871228349347, + "grad_norm": 0.2887377142906189, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0074, + "step": 3290 + }, + { + "epoch": 0.20197074484362568, + "grad_norm": 0.2322079837322235, + "learning_rate": 1.972231769371516e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 0.20258277740375788, + "grad_norm": 0.39307233691215515, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0095, + "step": 3310 + }, + { + "epoch": 0.20319480996389008, + "grad_norm": 0.5209783315658569, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.012, + "step": 3320 + }, + { + "epoch": 0.20380684252402229, + "grad_norm": 0.45187172293663025, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.2044188750841545, + "grad_norm": 0.480970174074173, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0072, + "step": 3340 + }, + { + "epoch": 0.20503090764428666, + "grad_norm": 0.30979010462760925, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0091, + "step": 3350 + }, + { + "epoch": 0.20564294020441887, + "grad_norm": 0.6410729289054871, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0096, + "step": 3360 + }, + { + "epoch": 0.20625497276455107, + "grad_norm": 0.23707512021064758, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0078, + "step": 3370 + }, + { + "epoch": 0.20686700532468327, + "grad_norm": 0.3029544949531555, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0115, + "step": 3380 + }, + { + "epoch": 0.20747903788481548, + "grad_norm": 0.28677740693092346, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0068, + "step": 3390 + }, + { + "epoch": 0.20809107044494768, + "grad_norm": 0.2433662712574005, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0082, + "step": 3400 + }, + { + "epoch": 0.20870310300507988, + "grad_norm": 0.38066667318344116, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0089, + "step": 3410 + }, + { + "epoch": 0.20931513556521206, + "grad_norm": 0.3830282390117645, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0073, + "step": 3420 + }, + { + "epoch": 0.20992716812534426, + "grad_norm": 0.359684556722641, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0088, + "step": 3430 + }, + { + "epoch": 0.21053920068547646, + "grad_norm": 0.3497346341609955, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0082, + "step": 3440 + }, + { + "epoch": 0.21115123324560867, + "grad_norm": 0.3664748966693878, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.21176326580574087, + "grad_norm": 0.382804811000824, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0077, + "step": 3460 + }, + { + "epoch": 0.21237529836587307, + "grad_norm": 0.22746194899082184, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0107, + "step": 3470 + }, + { + "epoch": 0.21298733092600527, + "grad_norm": 0.4094266891479492, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0096, + "step": 3480 + }, + { + "epoch": 0.21359936348613745, + "grad_norm": 0.26990365982055664, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0089, + "step": 3490 + }, + { + "epoch": 0.21421139604626965, + "grad_norm": 0.2602371275424957, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0074, + "step": 3500 + }, + { + "epoch": 0.21482342860640186, + "grad_norm": 0.34200435876846313, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21543546116653406, + "grad_norm": 0.4260508716106415, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.21604749372666626, + "grad_norm": 0.4017483592033386, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0103, + "step": 3530 + }, + { + "epoch": 0.21665952628679847, + "grad_norm": 0.40005844831466675, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0094, + "step": 3540 + }, + { + "epoch": 0.21727155884693067, + "grad_norm": 0.3856841027736664, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0095, + "step": 3550 + }, + { + "epoch": 0.21788359140706284, + "grad_norm": 0.3245168626308441, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0067, + "step": 3560 + }, + { + "epoch": 0.21849562396719505, + "grad_norm": 0.2698485255241394, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0079, + "step": 3570 + }, + { + "epoch": 0.21910765652732725, + "grad_norm": 0.24520452320575714, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0109, + "step": 3580 + }, + { + "epoch": 0.21971968908745945, + "grad_norm": 0.397175133228302, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0114, + "step": 3590 + }, + { + "epoch": 0.22033172164759166, + "grad_norm": 0.40339091420173645, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.22094375420772386, + "grad_norm": 0.404435396194458, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0087, + "step": 3610 + }, + { + "epoch": 0.22155578676785606, + "grad_norm": 0.3300188183784485, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0073, + "step": 3620 + }, + { + "epoch": 0.22216781932798824, + "grad_norm": 0.23486892879009247, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0087, + "step": 3630 + }, + { + "epoch": 0.22277985188812044, + "grad_norm": 0.37211188673973083, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.22339188444825264, + "grad_norm": 0.32422709465026855, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.009, + "step": 3650 + }, + { + "epoch": 0.22400391700838485, + "grad_norm": 0.43535664677619934, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0099, + "step": 3660 + }, + { + "epoch": 0.22461594956851705, + "grad_norm": 0.3295724093914032, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.22522798212864925, + "grad_norm": 0.2840734124183655, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0082, + "step": 3680 + }, + { + "epoch": 0.22584001468878145, + "grad_norm": 0.2861844599246979, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0079, + "step": 3690 + }, + { + "epoch": 0.22645204724891363, + "grad_norm": 0.3194407820701599, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0071, + "step": 3700 + }, + { + "epoch": 0.22706407980904583, + "grad_norm": 0.38770729303359985, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0076, + "step": 3710 + }, + { + "epoch": 0.22767611236917804, + "grad_norm": 0.4637960195541382, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0093, + "step": 3720 + }, + { + "epoch": 0.22828814492931024, + "grad_norm": 0.31972312927246094, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0078, + "step": 3730 + }, + { + "epoch": 0.22890017748944244, + "grad_norm": 0.5273001790046692, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0076, + "step": 3740 + }, + { + "epoch": 0.22951221004957464, + "grad_norm": 0.30589622259140015, + "learning_rate": 1.960385541132679e-05, + "loss": 0.009, + "step": 3750 + }, + { + "epoch": 0.23012424260970685, + "grad_norm": 0.31634265184402466, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0063, + "step": 3760 + }, + { + "epoch": 0.23073627516983902, + "grad_norm": 0.32762402296066284, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0081, + "step": 3770 + }, + { + "epoch": 0.23134830772997123, + "grad_norm": 0.42696496844291687, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 0.23196034029010343, + "grad_norm": 0.4676671624183655, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0093, + "step": 3790 + }, + { + "epoch": 0.23257237285023563, + "grad_norm": 0.3347911536693573, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 0.23318440541036783, + "grad_norm": 0.3083193600177765, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0072, + "step": 3810 + }, + { + "epoch": 0.23379643797050004, + "grad_norm": 0.38178423047065735, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0083, + "step": 3820 + }, + { + "epoch": 0.23440847053063224, + "grad_norm": 0.2796846330165863, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0072, + "step": 3830 + }, + { + "epoch": 0.23502050309076442, + "grad_norm": 0.37444883584976196, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.23563253565089662, + "grad_norm": 0.3286772668361664, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23624456821102882, + "grad_norm": 0.45423513650894165, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.23685660077116102, + "grad_norm": 0.36881721019744873, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0068, + "step": 3870 + }, + { + "epoch": 0.23746863333129323, + "grad_norm": 0.3560579717159271, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0084, + "step": 3880 + }, + { + "epoch": 0.23808066589142543, + "grad_norm": 0.43887296319007874, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0076, + "step": 3890 + }, + { + "epoch": 0.23869269845155763, + "grad_norm": 0.3080165982246399, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0069, + "step": 3900 + }, + { + "epoch": 0.2393047310116898, + "grad_norm": 0.2327195703983307, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.239916763571822, + "grad_norm": 0.5960802435874939, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0088, + "step": 3920 + }, + { + "epoch": 0.24052879613195421, + "grad_norm": 0.36213600635528564, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0076, + "step": 3930 + }, + { + "epoch": 0.24114082869208642, + "grad_norm": 0.2950032949447632, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0115, + "step": 3940 + }, + { + "epoch": 0.24175286125221862, + "grad_norm": 0.4527084529399872, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0089, + "step": 3950 + }, + { + "epoch": 0.24236489381235082, + "grad_norm": 0.4422491192817688, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0135, + "step": 3960 + }, + { + "epoch": 0.24297692637248303, + "grad_norm": 0.45049232244491577, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 0.2435889589326152, + "grad_norm": 0.2566494941711426, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0095, + "step": 3980 + }, + { + "epoch": 0.2442009914927474, + "grad_norm": 0.49880343675613403, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0099, + "step": 3990 + }, + { + "epoch": 0.2448130240528796, + "grad_norm": 0.4699341952800751, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0084, + "step": 4000 + }, + { + "epoch": 0.2454250566130118, + "grad_norm": 0.41230708360671997, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 0.246037089173144, + "grad_norm": 0.4836854934692383, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.01, + "step": 4020 + }, + { + "epoch": 0.24664912173327622, + "grad_norm": 0.3056115508079529, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0082, + "step": 4030 + }, + { + "epoch": 0.24726115429340842, + "grad_norm": 0.151325523853302, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0062, + "step": 4040 + }, + { + "epoch": 0.2478731868535406, + "grad_norm": 0.3798811137676239, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0081, + "step": 4050 + }, + { + "epoch": 0.2484852194136728, + "grad_norm": 0.3308229148387909, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 0.249097251973805, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0074, + "step": 4070 + }, + { + "epoch": 0.2497092845339372, + "grad_norm": 0.24179549515247345, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 0.2503213170940694, + "grad_norm": 0.20879383385181427, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 0.2509333496542016, + "grad_norm": 0.39275774359703064, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.2515453822143338, + "grad_norm": 0.2925782799720764, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0095, + "step": 4110 + }, + { + "epoch": 0.252157414774466, + "grad_norm": 0.6465128660202026, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0102, + "step": 4120 + }, + { + "epoch": 0.2527694473345982, + "grad_norm": 0.34663915634155273, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.007, + "step": 4130 + }, + { + "epoch": 0.2533814798947304, + "grad_norm": 0.3387165367603302, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0091, + "step": 4140 + }, + { + "epoch": 0.2539935124548626, + "grad_norm": 0.32989630103111267, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0084, + "step": 4150 + }, + { + "epoch": 0.25460554501499477, + "grad_norm": 0.22870391607284546, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0071, + "step": 4160 + }, + { + "epoch": 0.255217577575127, + "grad_norm": 0.3866496682167053, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0079, + "step": 4170 + }, + { + "epoch": 0.2558296101352592, + "grad_norm": 0.29885268211364746, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0068, + "step": 4180 + }, + { + "epoch": 0.2564416426953914, + "grad_norm": 0.4693736135959625, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 0.2570536752555236, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0074, + "step": 4200 + }, + { + "epoch": 0.2576657078156558, + "grad_norm": 0.21141012012958527, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 0.258277740375788, + "grad_norm": 0.2284570336341858, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0102, + "step": 4220 + }, + { + "epoch": 0.2588897729359202, + "grad_norm": 0.4675048887729645, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0112, + "step": 4230 + }, + { + "epoch": 0.2595018054960524, + "grad_norm": 0.3906441628932953, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0088, + "step": 4240 + }, + { + "epoch": 0.2601138380561846, + "grad_norm": 0.22990387678146362, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 0.2607258706163168, + "grad_norm": 0.41871073842048645, + "learning_rate": 1.944490251296856e-05, + "loss": 0.009, + "step": 4260 + }, + { + "epoch": 0.261337903176449, + "grad_norm": 0.2724440395832062, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0074, + "step": 4270 + }, + { + "epoch": 0.2619499357365812, + "grad_norm": 0.42590636014938354, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0069, + "step": 4280 + }, + { + "epoch": 0.2625619682967134, + "grad_norm": 0.3604855239391327, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0068, + "step": 4290 + }, + { + "epoch": 0.26317400085684556, + "grad_norm": 0.475304514169693, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0082, + "step": 4300 + }, + { + "epoch": 0.26378603341697776, + "grad_norm": 0.24752479791641235, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0065, + "step": 4310 + }, + { + "epoch": 0.26439806597710996, + "grad_norm": 0.4384835958480835, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0104, + "step": 4320 + }, + { + "epoch": 0.26501009853724217, + "grad_norm": 0.24999107420444489, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0076, + "step": 4330 + }, + { + "epoch": 0.26562213109737437, + "grad_norm": 0.292491614818573, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 0.2662341636575066, + "grad_norm": 0.2380208522081375, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0069, + "step": 4350 + }, + { + "epoch": 0.2668461962176388, + "grad_norm": 0.2906023859977722, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0063, + "step": 4360 + }, + { + "epoch": 0.267458228777771, + "grad_norm": 0.4718990623950958, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0074, + "step": 4370 + }, + { + "epoch": 0.2680702613379032, + "grad_norm": 0.33257269859313965, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0073, + "step": 4380 + }, + { + "epoch": 0.2686822938980354, + "grad_norm": 0.34411463141441345, + "learning_rate": 1.940024231916886e-05, + "loss": 0.006, + "step": 4390 + }, + { + "epoch": 0.2692943264581676, + "grad_norm": 0.40312516689300537, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0093, + "step": 4400 + }, + { + "epoch": 0.2699063590182998, + "grad_norm": 0.2248350828886032, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0082, + "step": 4410 + }, + { + "epoch": 0.270518391578432, + "grad_norm": 0.30094820261001587, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0092, + "step": 4420 + }, + { + "epoch": 0.2711304241385642, + "grad_norm": 0.4277440309524536, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0099, + "step": 4430 + }, + { + "epoch": 0.27174245669869634, + "grad_norm": 0.2876254916191101, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0078, + "step": 4440 + }, + { + "epoch": 0.27235448925882855, + "grad_norm": 0.3453986346721649, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0096, + "step": 4450 + }, + { + "epoch": 0.27296652181896075, + "grad_norm": 0.31379634141921997, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0088, + "step": 4460 + }, + { + "epoch": 0.27357855437909295, + "grad_norm": 0.294477254152298, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0073, + "step": 4470 + }, + { + "epoch": 0.27419058693922516, + "grad_norm": 0.3773270845413208, + "learning_rate": 1.936834723687526e-05, + "loss": 0.008, + "step": 4480 + }, + { + "epoch": 0.27480261949935736, + "grad_norm": 0.31942978501319885, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0086, + "step": 4490 + }, + { + "epoch": 0.27541465205948956, + "grad_norm": 0.46827632188796997, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27602668461962176, + "grad_norm": 0.2735249102115631, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0107, + "step": 4510 + }, + { + "epoch": 0.27663871717975397, + "grad_norm": 0.30048197507858276, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0082, + "step": 4520 + }, + { + "epoch": 0.27725074973988617, + "grad_norm": 0.3507469594478607, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0093, + "step": 4530 + }, + { + "epoch": 0.2778627823000184, + "grad_norm": 0.5642989277839661, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0091, + "step": 4540 + }, + { + "epoch": 0.2784748148601506, + "grad_norm": 0.2769993245601654, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0105, + "step": 4550 + }, + { + "epoch": 0.2790868474202828, + "grad_norm": 0.30269622802734375, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0066, + "step": 4560 + }, + { + "epoch": 0.279698879980415, + "grad_norm": 0.3717023432254791, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0114, + "step": 4570 + }, + { + "epoch": 0.28031091254054713, + "grad_norm": 0.5065163373947144, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0075, + "step": 4580 + }, + { + "epoch": 0.28092294510067933, + "grad_norm": 0.4302189350128174, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0077, + "step": 4590 + }, + { + "epoch": 0.28153497766081154, + "grad_norm": 0.44008374214172363, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0086, + "step": 4600 + }, + { + "epoch": 0.28214701022094374, + "grad_norm": 0.4647364318370819, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0073, + "step": 4610 + }, + { + "epoch": 0.28275904278107594, + "grad_norm": 0.4229913651943207, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0077, + "step": 4620 + }, + { + "epoch": 0.28337107534120815, + "grad_norm": 0.36600178480148315, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.28398310790134035, + "grad_norm": 0.47143280506134033, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0085, + "step": 4640 + }, + { + "epoch": 0.28459514046147255, + "grad_norm": 0.29140496253967285, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0056, + "step": 4650 + }, + { + "epoch": 0.28520717302160475, + "grad_norm": 0.3964666426181793, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0071, + "step": 4660 + }, + { + "epoch": 0.28581920558173696, + "grad_norm": 0.407536119222641, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0095, + "step": 4670 + }, + { + "epoch": 0.28643123814186916, + "grad_norm": 0.33687031269073486, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0091, + "step": 4680 + }, + { + "epoch": 0.28704327070200136, + "grad_norm": 0.3182448446750641, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0087, + "step": 4690 + }, + { + "epoch": 0.28765530326213357, + "grad_norm": 0.40998023748397827, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0099, + "step": 4700 + }, + { + "epoch": 0.28826733582226577, + "grad_norm": 0.28750360012054443, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0087, + "step": 4710 + }, + { + "epoch": 0.2888793683823979, + "grad_norm": 0.36494627594947815, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 0.2894914009425301, + "grad_norm": 0.37047910690307617, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0094, + "step": 4730 + }, + { + "epoch": 0.2901034335026623, + "grad_norm": 0.2577553987503052, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.2907154660627945, + "grad_norm": 0.24589397013187408, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0067, + "step": 4750 + }, + { + "epoch": 0.29132749862292673, + "grad_norm": 0.37927499413490295, + "learning_rate": 1.926404507646751e-05, + "loss": 0.008, + "step": 4760 + }, + { + "epoch": 0.29193953118305893, + "grad_norm": 0.40547946095466614, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0101, + "step": 4770 + }, + { + "epoch": 0.29255156374319113, + "grad_norm": 0.47896578907966614, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0114, + "step": 4780 + }, + { + "epoch": 0.29316359630332334, + "grad_norm": 0.42911696434020996, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0066, + "step": 4790 + }, + { + "epoch": 0.29377562886345554, + "grad_norm": 0.21735505759716034, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0072, + "step": 4800 + }, + { + "epoch": 0.29438766142358774, + "grad_norm": 0.25916650891304016, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.29499969398371995, + "grad_norm": 0.23863966763019562, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0086, + "step": 4820 + }, + { + "epoch": 0.29561172654385215, + "grad_norm": 0.41552650928497314, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0078, + "step": 4830 + }, + { + "epoch": 0.29622375910398435, + "grad_norm": 0.2775874733924866, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0082, + "step": 4840 + }, + { + "epoch": 0.29683579166411656, + "grad_norm": 0.28962916135787964, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0088, + "step": 4850 + }, + { + "epoch": 0.2974478242242487, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0076, + "step": 4860 + }, + { + "epoch": 0.2980598567843809, + "grad_norm": 0.3833489716053009, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0096, + "step": 4870 + }, + { + "epoch": 0.2986718893445131, + "grad_norm": 0.20357537269592285, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.2992839219046453, + "grad_norm": 0.4648539423942566, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0099, + "step": 4890 + }, + { + "epoch": 0.2998959544647775, + "grad_norm": 0.2701941728591919, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0062, + "step": 4900 + }, + { + "epoch": 0.3005079870249097, + "grad_norm": 0.31277161836624146, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0069, + "step": 4910 + }, + { + "epoch": 0.3011200195850419, + "grad_norm": 0.27697697281837463, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0094, + "step": 4920 + }, + { + "epoch": 0.3017320521451741, + "grad_norm": 0.22880606353282928, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0074, + "step": 4930 + }, + { + "epoch": 0.3023440847053063, + "grad_norm": 0.258404940366745, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0078, + "step": 4940 + }, + { + "epoch": 0.30295611726543853, + "grad_norm": 0.394394189119339, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0109, + "step": 4950 + }, + { + "epoch": 0.30356814982557073, + "grad_norm": 0.24108687043190002, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0082, + "step": 4960 + }, + { + "epoch": 0.30418018238570294, + "grad_norm": 0.34520867466926575, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0098, + "step": 4970 + }, + { + "epoch": 0.30479221494583514, + "grad_norm": 0.33723267912864685, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0104, + "step": 4980 + }, + { + "epoch": 0.30540424750596734, + "grad_norm": 0.28276878595352173, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0072, + "step": 4990 + }, + { + "epoch": 0.30601628006609954, + "grad_norm": 0.32236188650131226, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.012, + "step": 5000 + }, + { + "epoch": 0.3066283126262317, + "grad_norm": 0.20596888661384583, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0073, + "step": 5010 + }, + { + "epoch": 0.3072403451863639, + "grad_norm": 0.37921255826950073, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0073, + "step": 5020 + }, + { + "epoch": 0.3078523777464961, + "grad_norm": 0.30738911032676697, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0083, + "step": 5030 + }, + { + "epoch": 0.3084644103066283, + "grad_norm": 0.1938163936138153, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.3090764428667605, + "grad_norm": 0.25826898217201233, + "learning_rate": 1.914800406458133e-05, + "loss": 0.008, + "step": 5050 + }, + { + "epoch": 0.3096884754268927, + "grad_norm": 0.18951697647571564, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0058, + "step": 5060 + }, + { + "epoch": 0.3103005079870249, + "grad_norm": 0.3877381980419159, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3109125405471571, + "grad_norm": 0.3133573830127716, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0088, + "step": 5080 + }, + { + "epoch": 0.3115245731072893, + "grad_norm": 0.33131852746009827, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0062, + "step": 5090 + }, + { + "epoch": 0.3121366056674215, + "grad_norm": 0.21276263892650604, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0071, + "step": 5100 + }, + { + "epoch": 0.3127486382275537, + "grad_norm": 0.46878281235694885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0084, + "step": 5110 + }, + { + "epoch": 0.3133606707876859, + "grad_norm": 0.44227683544158936, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0097, + "step": 5120 + }, + { + "epoch": 0.3139727033478181, + "grad_norm": 0.41950204968452454, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.31458473590795033, + "grad_norm": 0.4214445948600769, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0081, + "step": 5140 + }, + { + "epoch": 0.3151967684680825, + "grad_norm": 0.3779868483543396, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0075, + "step": 5150 + }, + { + "epoch": 0.3158088010282147, + "grad_norm": 0.4587777853012085, + "learning_rate": 1.910187855634501e-05, + "loss": 0.009, + "step": 5160 + }, + { + "epoch": 0.3164208335883469, + "grad_norm": 0.4875587224960327, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0099, + "step": 5170 + }, + { + "epoch": 0.3170328661484791, + "grad_norm": 0.22378237545490265, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0071, + "step": 5180 + }, + { + "epoch": 0.3176448987086113, + "grad_norm": 0.3360678553581238, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0101, + "step": 5190 + }, + { + "epoch": 0.3182569312687435, + "grad_norm": 0.36370640993118286, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0068, + "step": 5200 + }, + { + "epoch": 0.3188689638288757, + "grad_norm": 0.25814393162727356, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0076, + "step": 5210 + }, + { + "epoch": 0.3194809963890079, + "grad_norm": 0.39010074734687805, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0066, + "step": 5220 + }, + { + "epoch": 0.3200930289491401, + "grad_norm": 0.44009074568748474, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0078, + "step": 5230 + }, + { + "epoch": 0.3207050615092723, + "grad_norm": 0.45733046531677246, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0074, + "step": 5240 + }, + { + "epoch": 0.3213170940694045, + "grad_norm": 0.4555135667324066, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0089, + "step": 5250 + }, + { + "epoch": 0.3219291266295367, + "grad_norm": 0.5864276885986328, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0083, + "step": 5260 + }, + { + "epoch": 0.3225411591896689, + "grad_norm": 0.3305470943450928, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0094, + "step": 5270 + }, + { + "epoch": 0.3231531917498011, + "grad_norm": 0.21458053588867188, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0091, + "step": 5280 + }, + { + "epoch": 0.32376522430993326, + "grad_norm": 0.2927384376525879, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.007, + "step": 5290 + }, + { + "epoch": 0.32437725687006547, + "grad_norm": 0.387608140707016, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0068, + "step": 5300 + }, + { + "epoch": 0.32498928943019767, + "grad_norm": 0.28193122148513794, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0065, + "step": 5310 + }, + { + "epoch": 0.3256013219903299, + "grad_norm": 0.33098119497299194, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0082, + "step": 5320 + }, + { + "epoch": 0.3262133545504621, + "grad_norm": 0.5442482233047485, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0124, + "step": 5330 + }, + { + "epoch": 0.3268253871105943, + "grad_norm": 0.503669798374176, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0093, + "step": 5340 + }, + { + "epoch": 0.3274374196707265, + "grad_norm": 0.2307574301958084, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0071, + "step": 5350 + }, + { + "epoch": 0.3280494522308587, + "grad_norm": 0.3543917238712311, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.007, + "step": 5360 + }, + { + "epoch": 0.3286614847909909, + "grad_norm": 0.21763169765472412, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0059, + "step": 5370 + }, + { + "epoch": 0.3292735173511231, + "grad_norm": 0.38023391366004944, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 0.3298855499112553, + "grad_norm": 0.44597327709198, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0069, + "step": 5390 + }, + { + "epoch": 0.3304975824713875, + "grad_norm": 0.2994389533996582, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0083, + "step": 5400 + }, + { + "epoch": 0.3311096150315197, + "grad_norm": 0.26668304204940796, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0071, + "step": 5410 + }, + { + "epoch": 0.3317216475916519, + "grad_norm": 0.25944197177886963, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0065, + "step": 5420 + }, + { + "epoch": 0.33233368015178405, + "grad_norm": 0.3646431267261505, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0065, + "step": 5430 + }, + { + "epoch": 0.33294571271191625, + "grad_norm": 0.34860959649086, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0098, + "step": 5440 + }, + { + "epoch": 0.33355774527204846, + "grad_norm": 0.33718568086624146, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0069, + "step": 5450 + }, + { + "epoch": 0.33416977783218066, + "grad_norm": 0.2417302280664444, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0064, + "step": 5460 + }, + { + "epoch": 0.33478181039231286, + "grad_norm": 0.26607826352119446, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0048, + "step": 5470 + }, + { + "epoch": 0.33539384295244506, + "grad_norm": 0.31762364506721497, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0064, + "step": 5480 + }, + { + "epoch": 0.33600587551257727, + "grad_norm": 0.21427015960216522, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0079, + "step": 5490 + }, + { + "epoch": 0.33661790807270947, + "grad_norm": 0.3372637629508972, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.3372299406328417, + "grad_norm": 0.3760700821876526, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0066, + "step": 5510 + }, + { + "epoch": 0.3378419731929739, + "grad_norm": 0.22838029265403748, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0061, + "step": 5520 + }, + { + "epoch": 0.3384540057531061, + "grad_norm": 0.3105243444442749, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0089, + "step": 5530 + }, + { + "epoch": 0.3390660383132383, + "grad_norm": 0.23694929480552673, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0086, + "step": 5540 + }, + { + "epoch": 0.3396780708733705, + "grad_norm": 0.22935174405574799, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.3402901034335027, + "grad_norm": 0.26384714245796204, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 0.34090213599363484, + "grad_norm": 0.33245643973350525, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0061, + "step": 5570 + }, + { + "epoch": 0.34151416855376704, + "grad_norm": 0.3904813230037689, + "learning_rate": 1.891523933768891e-05, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.34212620111389924, + "grad_norm": 0.33858415484428406, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0067, + "step": 5590 + }, + { + "epoch": 0.34273823367403145, + "grad_norm": 0.3197486996650696, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0082, + "step": 5600 + }, + { + "epoch": 0.34335026623416365, + "grad_norm": 0.23814789950847626, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0068, + "step": 5610 + }, + { + "epoch": 0.34396229879429585, + "grad_norm": 0.3820457458496094, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0082, + "step": 5620 + }, + { + "epoch": 0.34457433135442805, + "grad_norm": 0.27518680691719055, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0063, + "step": 5630 + }, + { + "epoch": 0.34518636391456026, + "grad_norm": 0.24741721153259277, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 0.34579839647469246, + "grad_norm": 0.5140052437782288, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0178, + "step": 5650 + }, + { + "epoch": 0.34641042903482466, + "grad_norm": 0.5363543033599854, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0097, + "step": 5660 + }, + { + "epoch": 0.34702246159495687, + "grad_norm": 0.41116055846214294, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0078, + "step": 5670 + }, + { + "epoch": 0.34763449415508907, + "grad_norm": 0.412762314081192, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0086, + "step": 5680 + }, + { + "epoch": 0.34824652671522127, + "grad_norm": 0.399527907371521, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0088, + "step": 5690 + }, + { + "epoch": 0.3488585592753535, + "grad_norm": 0.3447834551334381, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0084, + "step": 5700 + }, + { + "epoch": 0.3494705918354856, + "grad_norm": 0.3418859541416168, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0092, + "step": 5710 + }, + { + "epoch": 0.3500826243956178, + "grad_norm": 0.3336535692214966, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0065, + "step": 5720 + }, + { + "epoch": 0.35069465695575003, + "grad_norm": 0.34575122594833374, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0068, + "step": 5730 + }, + { + "epoch": 0.35130668951588223, + "grad_norm": 0.34325110912323, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.01, + "step": 5740 + }, + { + "epoch": 0.35191872207601443, + "grad_norm": 0.20104236900806427, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0085, + "step": 5750 + }, + { + "epoch": 0.35253075463614664, + "grad_norm": 0.33699074387550354, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0073, + "step": 5760 + }, + { + "epoch": 0.35314278719627884, + "grad_norm": 0.33322635293006897, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.35375481975641104, + "grad_norm": 0.26897475123405457, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0087, + "step": 5780 + }, + { + "epoch": 0.35436685231654325, + "grad_norm": 0.5310013890266418, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0067, + "step": 5790 + }, + { + "epoch": 0.35497888487667545, + "grad_norm": 0.4203440845012665, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0097, + "step": 5800 + }, + { + "epoch": 0.35559091743680765, + "grad_norm": 0.2179369181394577, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0082, + "step": 5810 + }, + { + "epoch": 0.35620294999693985, + "grad_norm": 0.2789444625377655, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0066, + "step": 5820 + }, + { + "epoch": 0.35681498255707206, + "grad_norm": 0.28009694814682007, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.007, + "step": 5830 + }, + { + "epoch": 0.35742701511720426, + "grad_norm": 0.304768443107605, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0065, + "step": 5840 + }, + { + "epoch": 0.3580390476773364, + "grad_norm": 0.2829401195049286, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0061, + "step": 5850 + }, + { + "epoch": 0.3586510802374686, + "grad_norm": 0.3388998508453369, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0083, + "step": 5860 + }, + { + "epoch": 0.3592631127976008, + "grad_norm": 0.3313426673412323, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 0.359875145357733, + "grad_norm": 0.2886904180049896, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0094, + "step": 5880 + }, + { + "epoch": 0.3604871779178652, + "grad_norm": 0.3132432997226715, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0086, + "step": 5890 + }, + { + "epoch": 0.3610992104779974, + "grad_norm": 0.37195107340812683, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0137, + "step": 5900 + }, + { + "epoch": 0.3617112430381296, + "grad_norm": 0.30853375792503357, + "learning_rate": 1.875708056549365e-05, + "loss": 0.01, + "step": 5910 + }, + { + "epoch": 0.36232327559826183, + "grad_norm": 0.39785459637641907, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0073, + "step": 5920 + }, + { + "epoch": 0.36293530815839403, + "grad_norm": 0.26958727836608887, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0059, + "step": 5930 + }, + { + "epoch": 0.36354734071852624, + "grad_norm": 0.354956716299057, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0069, + "step": 5940 + }, + { + "epoch": 0.36415937327865844, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0066, + "step": 5950 + }, + { + "epoch": 0.36477140583879064, + "grad_norm": 0.30000701546669006, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0075, + "step": 5960 + }, + { + "epoch": 0.36538343839892284, + "grad_norm": 0.5558263063430786, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0083, + "step": 5970 + }, + { + "epoch": 0.36599547095905505, + "grad_norm": 0.39146295189857483, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0062, + "step": 5980 + }, + { + "epoch": 0.3666075035191872, + "grad_norm": 0.44002753496170044, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0089, + "step": 5990 + }, + { + "epoch": 0.3672195360793194, + "grad_norm": 0.3220095932483673, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0097, + "step": 6000 + }, + { + "epoch": 0.3678315686394516, + "grad_norm": 0.3569507598876953, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0067, + "step": 6010 + }, + { + "epoch": 0.3684436011995838, + "grad_norm": 0.3004184365272522, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0084, + "step": 6020 + }, + { + "epoch": 0.369055633759716, + "grad_norm": 0.2931320071220398, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0079, + "step": 6030 + }, + { + "epoch": 0.3696676663198482, + "grad_norm": 0.39551016688346863, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0088, + "step": 6040 + }, + { + "epoch": 0.3702796988799804, + "grad_norm": 0.33755603432655334, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0082, + "step": 6050 + }, + { + "epoch": 0.3708917314401126, + "grad_norm": 0.3101558983325958, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0069, + "step": 6060 + }, + { + "epoch": 0.3715037640002448, + "grad_norm": 0.2921602129936218, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0065, + "step": 6070 + }, + { + "epoch": 0.372115796560377, + "grad_norm": 0.3601403832435608, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0063, + "step": 6080 + }, + { + "epoch": 0.3727278291205092, + "grad_norm": 0.34929168224334717, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0073, + "step": 6090 + }, + { + "epoch": 0.3733398616806414, + "grad_norm": 0.3987390995025635, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0068, + "step": 6100 + }, + { + "epoch": 0.37395189424077363, + "grad_norm": 0.2641090452671051, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0091, + "step": 6110 + }, + { + "epoch": 0.37456392680090583, + "grad_norm": 0.23139338195323944, + "learning_rate": 1.865125972978549e-05, + "loss": 0.006, + "step": 6120 + }, + { + "epoch": 0.375175959361038, + "grad_norm": 0.26552167534828186, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.3757879919211702, + "grad_norm": 0.43827885389328003, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0084, + "step": 6140 + }, + { + "epoch": 0.3764000244813024, + "grad_norm": 0.27495354413986206, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 0.3770120570414346, + "grad_norm": 0.36078640818595886, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0104, + "step": 6160 + }, + { + "epoch": 0.3776240896015668, + "grad_norm": 0.28252753615379333, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0103, + "step": 6170 + }, + { + "epoch": 0.378236122161699, + "grad_norm": 0.2674558162689209, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0069, + "step": 6180 + }, + { + "epoch": 0.3788481547218312, + "grad_norm": 0.21457509696483612, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.3794601872819634, + "grad_norm": 0.3142339885234833, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0075, + "step": 6200 + }, + { + "epoch": 0.3800722198420956, + "grad_norm": 0.32714203000068665, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0084, + "step": 6210 + }, + { + "epoch": 0.3806842524022278, + "grad_norm": 0.2632557153701782, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0072, + "step": 6220 + }, + { + "epoch": 0.38129628496236, + "grad_norm": 0.1893932968378067, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0063, + "step": 6230 + }, + { + "epoch": 0.3819083175224922, + "grad_norm": 0.49935290217399597, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0087, + "step": 6240 + }, + { + "epoch": 0.3825203500826244, + "grad_norm": 0.34605127573013306, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0101, + "step": 6250 + }, + { + "epoch": 0.3831323826427566, + "grad_norm": 0.3294198513031006, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0067, + "step": 6260 + }, + { + "epoch": 0.38374441520288877, + "grad_norm": 0.34797370433807373, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0065, + "step": 6270 + }, + { + "epoch": 0.38435644776302097, + "grad_norm": 0.37710750102996826, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0061, + "step": 6280 + }, + { + "epoch": 0.3849684803231532, + "grad_norm": 0.39949893951416016, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0066, + "step": 6290 + }, + { + "epoch": 0.3855805128832854, + "grad_norm": 0.33014294505119324, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0095, + "step": 6300 + }, + { + "epoch": 0.3861925454434176, + "grad_norm": 0.4329249858856201, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0089, + "step": 6310 + }, + { + "epoch": 0.3868045780035498, + "grad_norm": 0.298330157995224, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0076, + "step": 6320 + }, + { + "epoch": 0.387416610563682, + "grad_norm": 0.2672661542892456, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 0.3880286431238142, + "grad_norm": 0.48193076252937317, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0103, + "step": 6340 + }, + { + "epoch": 0.3886406756839464, + "grad_norm": 0.29180601239204407, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0075, + "step": 6350 + }, + { + "epoch": 0.3892527082440786, + "grad_norm": 0.21320492029190063, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0077, + "step": 6360 + }, + { + "epoch": 0.3898647408042108, + "grad_norm": 0.37252935767173767, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0078, + "step": 6370 + }, + { + "epoch": 0.390476773364343, + "grad_norm": 0.284586101770401, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0089, + "step": 6380 + }, + { + "epoch": 0.3910888059244752, + "grad_norm": 0.5030382871627808, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0088, + "step": 6390 + }, + { + "epoch": 0.3917008384846074, + "grad_norm": 0.357239305973053, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0069, + "step": 6400 + }, + { + "epoch": 0.39231287104473955, + "grad_norm": 0.20308594405651093, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0062, + "step": 6410 + }, + { + "epoch": 0.39292490360487176, + "grad_norm": 0.2678150534629822, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.39353693616500396, + "grad_norm": 0.35160595178604126, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0072, + "step": 6430 + }, + { + "epoch": 0.39414896872513616, + "grad_norm": 0.33254173398017883, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 0.39476100128526836, + "grad_norm": 0.22763408720493317, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0061, + "step": 6450 + }, + { + "epoch": 0.39537303384540057, + "grad_norm": 0.20889192819595337, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 0.39598506640553277, + "grad_norm": 0.22515206038951874, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0086, + "step": 6470 + }, + { + "epoch": 0.396597098965665, + "grad_norm": 0.36421817541122437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0064, + "step": 6480 + }, + { + "epoch": 0.3972091315257972, + "grad_norm": 0.3869773745536804, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0084, + "step": 6490 + }, + { + "epoch": 0.3978211640859294, + "grad_norm": 0.26248687505722046, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0061, + "step": 6500 + }, + { + "epoch": 0.3984331966460616, + "grad_norm": 0.22152310609817505, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0057, + "step": 6510 + }, + { + "epoch": 0.3990452292061938, + "grad_norm": 0.25921961665153503, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0071, + "step": 6520 + }, + { + "epoch": 0.399657261766326, + "grad_norm": 0.3289903998374939, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.4002692943264582, + "grad_norm": 0.2767571210861206, + "learning_rate": 1.8427795928237e-05, + "loss": 0.01, + "step": 6540 + }, + { + "epoch": 0.40088132688659034, + "grad_norm": 0.46339666843414307, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0064, + "step": 6550 + }, + { + "epoch": 0.40149335944672254, + "grad_norm": 0.2942553460597992, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0067, + "step": 6560 + }, + { + "epoch": 0.40210539200685474, + "grad_norm": 0.3868240714073181, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0085, + "step": 6570 + }, + { + "epoch": 0.40271742456698695, + "grad_norm": 0.3999684154987335, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0067, + "step": 6580 + }, + { + "epoch": 0.40332945712711915, + "grad_norm": 0.42856812477111816, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0091, + "step": 6590 + }, + { + "epoch": 0.40394148968725135, + "grad_norm": 0.3099806010723114, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0084, + "step": 6600 + }, + { + "epoch": 0.40455352224738356, + "grad_norm": 0.3798827826976776, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0066, + "step": 6610 + }, + { + "epoch": 0.40516555480751576, + "grad_norm": 0.19007280468940735, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 0.40577758736764796, + "grad_norm": 0.3723277151584625, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0085, + "step": 6630 + }, + { + "epoch": 0.40638961992778017, + "grad_norm": 0.21034900844097137, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0069, + "step": 6640 + }, + { + "epoch": 0.40700165248791237, + "grad_norm": 0.29838645458221436, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0095, + "step": 6650 + }, + { + "epoch": 0.40761368504804457, + "grad_norm": 0.2645854353904724, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0061, + "step": 6660 + }, + { + "epoch": 0.4082257176081768, + "grad_norm": 0.21633592247962952, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.006, + "step": 6670 + }, + { + "epoch": 0.408837750168309, + "grad_norm": 0.25387731194496155, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.008, + "step": 6680 + }, + { + "epoch": 0.4094497827284412, + "grad_norm": 0.3752288520336151, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0092, + "step": 6690 + }, + { + "epoch": 0.41006181528857333, + "grad_norm": 0.33368971943855286, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0072, + "step": 6700 + }, + { + "epoch": 0.41067384784870553, + "grad_norm": 0.34388917684555054, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0074, + "step": 6710 + }, + { + "epoch": 0.41128588040883773, + "grad_norm": 0.2683192789554596, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.007, + "step": 6720 + }, + { + "epoch": 0.41189791296896994, + "grad_norm": 0.5121234059333801, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0074, + "step": 6730 + }, + { + "epoch": 0.41250994552910214, + "grad_norm": 0.333406925201416, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0117, + "step": 6740 + }, + { + "epoch": 0.41312197808923434, + "grad_norm": 0.26011794805526733, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0062, + "step": 6750 + }, + { + "epoch": 0.41373401064936655, + "grad_norm": 0.28925821185112, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0066, + "step": 6760 + }, + { + "epoch": 0.41434604320949875, + "grad_norm": 0.2202957570552826, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0058, + "step": 6770 + }, + { + "epoch": 0.41495807576963095, + "grad_norm": 0.2740793824195862, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0072, + "step": 6780 + }, + { + "epoch": 0.41557010832976315, + "grad_norm": 0.46569427847862244, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0069, + "step": 6790 + }, + { + "epoch": 0.41618214088989536, + "grad_norm": 0.3959881067276001, + "learning_rate": 1.828172598376902e-05, + "loss": 0.009, + "step": 6800 + }, + { + "epoch": 0.41679417345002756, + "grad_norm": 0.2465214729309082, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0068, + "step": 6810 + }, + { + "epoch": 0.41740620601015976, + "grad_norm": 0.3207756280899048, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0083, + "step": 6820 + }, + { + "epoch": 0.41801823857029197, + "grad_norm": 0.5600990653038025, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.4186302711304241, + "grad_norm": 0.32832831144332886, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0072, + "step": 6840 + }, + { + "epoch": 0.4192423036905563, + "grad_norm": 0.3397129774093628, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0083, + "step": 6850 + }, + { + "epoch": 0.4198543362506885, + "grad_norm": 0.3481312096118927, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.4204663688108207, + "grad_norm": 0.4542059898376465, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0104, + "step": 6870 + }, + { + "epoch": 0.4210784013709529, + "grad_norm": 0.2517620325088501, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0089, + "step": 6880 + }, + { + "epoch": 0.42169043393108513, + "grad_norm": 0.3671923875808716, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0068, + "step": 6890 + }, + { + "epoch": 0.42230246649121733, + "grad_norm": 0.41340726613998413, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0084, + "step": 6900 + }, + { + "epoch": 0.42291449905134954, + "grad_norm": 0.22815965116024017, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 0.42352653161148174, + "grad_norm": 0.35324010252952576, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0073, + "step": 6920 + }, + { + "epoch": 0.42413856417161394, + "grad_norm": 0.30134323239326477, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0071, + "step": 6930 + }, + { + "epoch": 0.42475059673174614, + "grad_norm": 0.4007415771484375, + "learning_rate": 1.82006727813775e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 0.42536262929187835, + "grad_norm": 0.3320179879665375, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0074, + "step": 6950 + }, + { + "epoch": 0.42597466185201055, + "grad_norm": 0.311971515417099, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0062, + "step": 6960 + }, + { + "epoch": 0.42658669441214275, + "grad_norm": 0.34347453713417053, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 0.4271987269722749, + "grad_norm": 0.25632336735725403, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.4278107595324071, + "grad_norm": 0.21711130440235138, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0068, + "step": 6990 + }, + { + "epoch": 0.4284227920925393, + "grad_norm": 0.3381270170211792, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0064, + "step": 7000 + }, + { + "epoch": 0.4290348246526715, + "grad_norm": 0.32262885570526123, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0091, + "step": 7010 + }, + { + "epoch": 0.4296468572128037, + "grad_norm": 0.65865558385849, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0104, + "step": 7020 + }, + { + "epoch": 0.4302588897729359, + "grad_norm": 0.3021128177642822, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.009, + "step": 7030 + }, + { + "epoch": 0.4308709223330681, + "grad_norm": 0.2859005331993103, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0065, + "step": 7040 + }, + { + "epoch": 0.4314829548932003, + "grad_norm": 0.3379405736923218, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0062, + "step": 7050 + }, + { + "epoch": 0.4320949874533325, + "grad_norm": 0.22009991109371185, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.007, + "step": 7060 + }, + { + "epoch": 0.4327070200134647, + "grad_norm": 0.24766206741333008, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0072, + "step": 7070 + }, + { + "epoch": 0.43331905257359693, + "grad_norm": 0.3557615280151367, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0096, + "step": 7080 + }, + { + "epoch": 0.43393108513372913, + "grad_norm": 0.5700691938400269, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0066, + "step": 7090 + }, + { + "epoch": 0.43454311769386134, + "grad_norm": 0.3194892704486847, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0065, + "step": 7100 + }, + { + "epoch": 0.43515515025399354, + "grad_norm": 0.2766750752925873, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0074, + "step": 7110 + }, + { + "epoch": 0.4357671828141257, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.809403050791396e-05, + "loss": 0.007, + "step": 7120 + }, + { + "epoch": 0.4363792153742579, + "grad_norm": 0.4468507170677185, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0066, + "step": 7130 + }, + { + "epoch": 0.4369912479343901, + "grad_norm": 0.3282400369644165, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0185, + "step": 7140 + }, + { + "epoch": 0.4376032804945223, + "grad_norm": 0.2625710964202881, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0088, + "step": 7150 + }, + { + "epoch": 0.4382153130546545, + "grad_norm": 0.47729599475860596, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.008, + "step": 7160 + }, + { + "epoch": 0.4388273456147867, + "grad_norm": 0.30350950360298157, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0091, + "step": 7170 + }, + { + "epoch": 0.4394393781749189, + "grad_norm": 0.3514627516269684, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0065, + "step": 7180 + }, + { + "epoch": 0.4400514107350511, + "grad_norm": 0.26150578260421753, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0087, + "step": 7190 + }, + { + "epoch": 0.4406634432951833, + "grad_norm": 0.374138206243515, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0073, + "step": 7200 + }, + { + "epoch": 0.4412754758553155, + "grad_norm": 0.2980635166168213, + "learning_rate": 1.803969531201634e-05, + "loss": 0.007, + "step": 7210 + }, + { + "epoch": 0.4418875084154477, + "grad_norm": 0.38190510869026184, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0077, + "step": 7220 + }, + { + "epoch": 0.4424995409755799, + "grad_norm": 0.28819066286087036, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0065, + "step": 7230 + }, + { + "epoch": 0.4431115735357121, + "grad_norm": 0.43382275104522705, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0077, + "step": 7240 + }, + { + "epoch": 0.4437236060958443, + "grad_norm": 0.31589648127555847, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0065, + "step": 7250 + }, + { + "epoch": 0.4443356386559765, + "grad_norm": 0.3744536340236664, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0067, + "step": 7260 + }, + { + "epoch": 0.4449476712161087, + "grad_norm": 0.2600225806236267, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.008, + "step": 7270 + }, + { + "epoch": 0.4455597037762409, + "grad_norm": 0.28064799308776855, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0061, + "step": 7280 + }, + { + "epoch": 0.4461717363363731, + "grad_norm": 0.2745135426521301, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0075, + "step": 7290 + }, + { + "epoch": 0.4467837688965053, + "grad_norm": 0.23609793186187744, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 0.4473958014566375, + "grad_norm": 0.35910022258758545, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0079, + "step": 7310 + }, + { + "epoch": 0.4480078340167697, + "grad_norm": 0.22230662405490875, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0065, + "step": 7320 + }, + { + "epoch": 0.4486198665769019, + "grad_norm": 0.3835199475288391, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.008, + "step": 7330 + }, + { + "epoch": 0.4492318991370341, + "grad_norm": 0.37863102555274963, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0083, + "step": 7340 + }, + { + "epoch": 0.4498439316971663, + "grad_norm": 0.25412216782569885, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0056, + "step": 7350 + }, + { + "epoch": 0.4504559642572985, + "grad_norm": 0.43248918652534485, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0061, + "step": 7360 + }, + { + "epoch": 0.4510679968174307, + "grad_norm": 0.2937811613082886, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0073, + "step": 7370 + }, + { + "epoch": 0.4516800293775629, + "grad_norm": 0.3018436133861542, + "learning_rate": 1.793524061803872e-05, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.4522920619376951, + "grad_norm": 0.32781726121902466, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0079, + "step": 7390 + }, + { + "epoch": 0.45290409449782726, + "grad_norm": 0.2843719720840454, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0067, + "step": 7400 + }, + { + "epoch": 0.45351612705795946, + "grad_norm": 0.27588292956352234, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0092, + "step": 7410 + }, + { + "epoch": 0.45412815961809166, + "grad_norm": 0.38858234882354736, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0067, + "step": 7420 + }, + { + "epoch": 0.45474019217822387, + "grad_norm": 0.4235166609287262, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 0.45535222473835607, + "grad_norm": 0.272210031747818, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0059, + "step": 7440 + }, + { + "epoch": 0.4559642572984883, + "grad_norm": 0.23851896822452545, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0071, + "step": 7450 + }, + { + "epoch": 0.4565762898586205, + "grad_norm": 0.37179476022720337, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0073, + "step": 7460 + }, + { + "epoch": 0.4571883224187527, + "grad_norm": 0.31902605295181274, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.009, + "step": 7470 + }, + { + "epoch": 0.4578003549788849, + "grad_norm": 0.47023633122444153, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0091, + "step": 7480 + }, + { + "epoch": 0.4584123875390171, + "grad_norm": 0.35726839303970337, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0079, + "step": 7490 + }, + { + "epoch": 0.4590244200991493, + "grad_norm": 0.27567291259765625, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0073, + "step": 7500 + }, + { + "epoch": 0.4596364526592815, + "grad_norm": 0.23053516447544098, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0065, + "step": 7510 + }, + { + "epoch": 0.4602484852194137, + "grad_norm": 0.2169056385755539, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0054, + "step": 7520 + }, + { + "epoch": 0.4608605177795459, + "grad_norm": 0.2912258207798004, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.46147255033967804, + "grad_norm": 0.2527846097946167, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.006, + "step": 7540 + }, + { + "epoch": 0.46208458289981025, + "grad_norm": 0.3878445029258728, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 0.46269661545994245, + "grad_norm": 0.3981980085372925, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0081, + "step": 7560 + }, + { + "epoch": 0.46330864802007465, + "grad_norm": 0.48834845423698425, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "epoch": 0.46392068058020686, + "grad_norm": 0.3045276701450348, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0085, + "step": 7580 + }, + { + "epoch": 0.46453271314033906, + "grad_norm": 0.23345299065113068, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0072, + "step": 7590 + }, + { + "epoch": 0.46514474570047126, + "grad_norm": 0.3632943034172058, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 0.46575677826060347, + "grad_norm": 0.19813670217990875, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0073, + "step": 7610 + }, + { + "epoch": 0.46636881082073567, + "grad_norm": 0.36094173789024353, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0085, + "step": 7620 + }, + { + "epoch": 0.46698084338086787, + "grad_norm": 0.30049464106559753, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0085, + "step": 7630 + }, + { + "epoch": 0.4675928759410001, + "grad_norm": 0.27693697810173035, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.4682049085011323, + "grad_norm": 0.3656866252422333, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0064, + "step": 7650 + }, + { + "epoch": 0.4688169410612645, + "grad_norm": 0.602168083190918, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 0.4694289736213967, + "grad_norm": 0.3553078770637512, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0062, + "step": 7670 + }, + { + "epoch": 0.47004100618152883, + "grad_norm": 0.326695054769516, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0062, + "step": 7680 + }, + { + "epoch": 0.47065303874166103, + "grad_norm": 0.2762170732021332, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0075, + "step": 7690 + }, + { + "epoch": 0.47126507130179324, + "grad_norm": 0.35057321190834045, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0063, + "step": 7700 + }, + { + "epoch": 0.47187710386192544, + "grad_norm": 0.3906462788581848, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0086, + "step": 7710 + }, + { + "epoch": 0.47248913642205764, + "grad_norm": 0.290752112865448, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0087, + "step": 7720 + }, + { + "epoch": 0.47310116898218985, + "grad_norm": 0.2242034673690796, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.47371320154232205, + "grad_norm": 0.3283435106277466, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0068, + "step": 7740 + }, + { + "epoch": 0.47432523410245425, + "grad_norm": 0.24059069156646729, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0056, + "step": 7750 + }, + { + "epoch": 0.47493726666258645, + "grad_norm": 0.2978667914867401, + "learning_rate": 1.769330275540774e-05, + "loss": 0.007, + "step": 7760 + }, + { + "epoch": 0.47554929922271866, + "grad_norm": 0.2605571150779724, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0084, + "step": 7770 + }, + { + "epoch": 0.47616133178285086, + "grad_norm": 0.4010445475578308, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0075, + "step": 7780 + }, + { + "epoch": 0.47677336434298306, + "grad_norm": 0.31932029128074646, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0076, + "step": 7790 + }, + { + "epoch": 0.47738539690311527, + "grad_norm": 0.3508684039115906, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0067, + "step": 7800 + }, + { + "epoch": 0.47799742946324747, + "grad_norm": 0.2835206091403961, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0064, + "step": 7810 + }, + { + "epoch": 0.4786094620233796, + "grad_norm": 0.2661663293838501, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0051, + "step": 7820 + }, + { + "epoch": 0.4792214945835118, + "grad_norm": 0.4146379828453064, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.479833527143644, + "grad_norm": 0.38621196150779724, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0076, + "step": 7840 + }, + { + "epoch": 0.4804455597037762, + "grad_norm": 0.19052188098430634, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.008, + "step": 7850 + }, + { + "epoch": 0.48105759226390843, + "grad_norm": 0.3699149489402771, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0071, + "step": 7860 + }, + { + "epoch": 0.48166962482404063, + "grad_norm": 0.3756427764892578, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0071, + "step": 7870 + }, + { + "epoch": 0.48228165738417283, + "grad_norm": 0.2987386882305145, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0069, + "step": 7880 + }, + { + "epoch": 0.48289368994430504, + "grad_norm": 0.24891899526119232, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.48350572250443724, + "grad_norm": 0.44080299139022827, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.011, + "step": 7900 + }, + { + "epoch": 0.48411775506456944, + "grad_norm": 0.20801177620887756, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0089, + "step": 7910 + }, + { + "epoch": 0.48472978762470165, + "grad_norm": 0.31475305557250977, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0091, + "step": 7920 + }, + { + "epoch": 0.48534182018483385, + "grad_norm": 0.29783639311790466, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0082, + "step": 7930 + }, + { + "epoch": 0.48595385274496605, + "grad_norm": 0.3330203890800476, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0071, + "step": 7940 + }, + { + "epoch": 0.48656588530509826, + "grad_norm": 0.3537667691707611, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0068, + "step": 7950 + }, + { + "epoch": 0.4871779178652304, + "grad_norm": 0.2810688316822052, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0059, + "step": 7960 + }, + { + "epoch": 0.4877899504253626, + "grad_norm": 0.3359779715538025, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.4884019829854948, + "grad_norm": 0.36015257239341736, + "learning_rate": 1.754802282200567e-05, + "loss": 0.008, + "step": 7980 + }, + { + "epoch": 0.489014015545627, + "grad_norm": 0.2647690176963806, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0065, + "step": 7990 + }, + { + "epoch": 0.4896260481057592, + "grad_norm": 0.23366811871528625, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0068, + "step": 8000 + }, + { + "epoch": 0.4902380806658914, + "grad_norm": 0.2904139757156372, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0054, + "step": 8010 + }, + { + "epoch": 0.4908501132260236, + "grad_norm": 0.30941230058670044, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0069, + "step": 8020 + }, + { + "epoch": 0.4914621457861558, + "grad_norm": 0.1959473341703415, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0063, + "step": 8030 + }, + { + "epoch": 0.492074178346288, + "grad_norm": 0.33349713683128357, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0059, + "step": 8040 + }, + { + "epoch": 0.49268621090642023, + "grad_norm": 0.39017921686172485, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0067, + "step": 8050 + }, + { + "epoch": 0.49329824346655243, + "grad_norm": 0.36401957273483276, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0061, + "step": 8060 + }, + { + "epoch": 0.49391027602668464, + "grad_norm": 0.22296921908855438, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0071, + "step": 8070 + }, + { + "epoch": 0.49452230858681684, + "grad_norm": 0.8712129592895508, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0104, + "step": 8080 + }, + { + "epoch": 0.49513434114694904, + "grad_norm": 0.39942649006843567, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0068, + "step": 8090 + }, + { + "epoch": 0.4957463737070812, + "grad_norm": 0.3821292817592621, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0076, + "step": 8100 + }, + { + "epoch": 0.4963584062672134, + "grad_norm": 0.35861077904701233, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0086, + "step": 8110 + }, + { + "epoch": 0.4969704388273456, + "grad_norm": 0.38629451394081116, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0068, + "step": 8120 + }, + { + "epoch": 0.4975824713874778, + "grad_norm": 3.412374973297119, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0168, + "step": 8130 + }, + { + "epoch": 0.49819450394761, + "grad_norm": 0.2893833816051483, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0065, + "step": 8140 + }, + { + "epoch": 0.4988065365077422, + "grad_norm": 0.37679117918014526, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0058, + "step": 8150 + }, + { + "epoch": 0.4994185690678744, + "grad_norm": 0.2745130658149719, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0082, + "step": 8160 + }, + { + "epoch": 0.5000306016280066, + "grad_norm": 0.30250442028045654, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0065, + "step": 8170 + }, + { + "epoch": 0.5006426341881388, + "grad_norm": 0.19602464139461517, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0056, + "step": 8180 + }, + { + "epoch": 0.501254666748271, + "grad_norm": 0.4736115634441376, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0062, + "step": 8190 + }, + { + "epoch": 0.5018666993084032, + "grad_norm": 0.25439244508743286, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0069, + "step": 8200 + }, + { + "epoch": 0.5024787318685354, + "grad_norm": 0.19290995597839355, + "learning_rate": 1.739216409306913e-05, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.5030907644286676, + "grad_norm": 0.24844267964363098, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0071, + "step": 8220 + }, + { + "epoch": 0.5037027969887998, + "grad_norm": 0.21179668605327606, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 0.504314829548932, + "grad_norm": 0.29139387607574463, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.5049268621090642, + "grad_norm": 0.2621973752975464, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0068, + "step": 8250 + }, + { + "epoch": 0.5055388946691964, + "grad_norm": 0.23394125699996948, + "learning_rate": 1.735775329110705e-05, + "loss": 0.006, + "step": 8260 + }, + { + "epoch": 0.5061509272293286, + "grad_norm": 0.28399863839149475, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0067, + "step": 8270 + }, + { + "epoch": 0.5067629597894608, + "grad_norm": 0.5048072934150696, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.008, + "step": 8280 + }, + { + "epoch": 0.507374992349593, + "grad_norm": 0.33848801255226135, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0054, + "step": 8290 + }, + { + "epoch": 0.5079870249097252, + "grad_norm": 0.28341951966285706, + "learning_rate": 1.733009030001197e-05, + "loss": 0.008, + "step": 8300 + }, + { + "epoch": 0.5085990574698575, + "grad_norm": 0.3223153054714203, + "learning_rate": 1.732315596014244e-05, + "loss": 0.007, + "step": 8310 + }, + { + "epoch": 0.5092110900299895, + "grad_norm": 0.23227599263191223, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.5098231225901217, + "grad_norm": 0.2847786247730255, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.510435155150254, + "grad_norm": 0.2026357650756836, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 0.5110471877103862, + "grad_norm": 0.3617453873157501, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 0.5116592202705184, + "grad_norm": 0.4439109265804291, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0067, + "step": 8360 + }, + { + "epoch": 0.5122712528306506, + "grad_norm": 0.26640209555625916, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0086, + "step": 8370 + }, + { + "epoch": 0.5128832853907828, + "grad_norm": 0.38045984506607056, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.513495317950915, + "grad_norm": 0.23035791516304016, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.006, + "step": 8390 + }, + { + "epoch": 0.5141073505110472, + "grad_norm": 0.40618664026260376, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0071, + "step": 8400 + }, + { + "epoch": 0.5147193830711794, + "grad_norm": 0.2593354880809784, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0064, + "step": 8410 + }, + { + "epoch": 0.5153314156313116, + "grad_norm": 0.27723655104637146, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0062, + "step": 8420 + }, + { + "epoch": 0.5159434481914438, + "grad_norm": 0.3793911039829254, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0059, + "step": 8430 + }, + { + "epoch": 0.516555480751576, + "grad_norm": 0.28634312748908997, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0063, + "step": 8440 + }, + { + "epoch": 0.5171675133117082, + "grad_norm": 0.39417290687561035, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0078, + "step": 8450 + }, + { + "epoch": 0.5177795458718404, + "grad_norm": 0.3043057322502136, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0064, + "step": 8460 + }, + { + "epoch": 0.5183915784319726, + "grad_norm": 0.36794111132621765, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0106, + "step": 8470 + }, + { + "epoch": 0.5190036109921048, + "grad_norm": 0.312161922454834, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0067, + "step": 8480 + }, + { + "epoch": 0.519615643552237, + "grad_norm": 0.39240267872810364, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0065, + "step": 8490 + }, + { + "epoch": 0.5202276761123692, + "grad_norm": 0.4500446915626526, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0073, + "step": 8500 + }, + { + "epoch": 0.5208397086725014, + "grad_norm": 0.22808927297592163, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0072, + "step": 8510 + }, + { + "epoch": 0.5214517412326336, + "grad_norm": 0.3262411057949066, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0065, + "step": 8520 + }, + { + "epoch": 0.5220637737927658, + "grad_norm": 0.472229927778244, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0068, + "step": 8530 + }, + { + "epoch": 0.522675806352898, + "grad_norm": 0.31563568115234375, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5232878389130302, + "grad_norm": 0.27949750423431396, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0064, + "step": 8550 + }, + { + "epoch": 0.5238998714731624, + "grad_norm": 0.30297499895095825, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0075, + "step": 8560 + }, + { + "epoch": 0.5245119040332946, + "grad_norm": 0.3946770429611206, + "learning_rate": 1.714028248198457e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 0.5251239365934268, + "grad_norm": 0.3405992090702057, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0082, + "step": 8580 + }, + { + "epoch": 0.525735969153559, + "grad_norm": 0.2963511347770691, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0066, + "step": 8590 + }, + { + "epoch": 0.5263480017136911, + "grad_norm": 0.1909177303314209, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.006, + "step": 8600 + }, + { + "epoch": 0.5269600342738233, + "grad_norm": 0.3378836512565613, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 0.5275720668339555, + "grad_norm": 0.30862805247306824, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0067, + "step": 8620 + }, + { + "epoch": 0.5281840993940877, + "grad_norm": 0.397293359041214, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0071, + "step": 8630 + }, + { + "epoch": 0.5287961319542199, + "grad_norm": 0.3665411174297333, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0068, + "step": 8640 + }, + { + "epoch": 0.5294081645143521, + "grad_norm": 0.34842419624328613, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.5300201970744843, + "grad_norm": 0.38205671310424805, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0065, + "step": 8660 + }, + { + "epoch": 0.5306322296346165, + "grad_norm": 0.35549092292785645, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0068, + "step": 8670 + }, + { + "epoch": 0.5312442621947487, + "grad_norm": 0.15676020085811615, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0055, + "step": 8680 + }, + { + "epoch": 0.5318562947548809, + "grad_norm": 0.22985056042671204, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0071, + "step": 8690 + }, + { + "epoch": 0.5324683273150131, + "grad_norm": 0.2743426263332367, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0047, + "step": 8700 + }, + { + "epoch": 0.5330803598751453, + "grad_norm": 0.2503803074359894, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0079, + "step": 8710 + }, + { + "epoch": 0.5336923924352776, + "grad_norm": 0.5036469101905823, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5343044249954098, + "grad_norm": 0.2349964827299118, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0068, + "step": 8730 + }, + { + "epoch": 0.534916457555542, + "grad_norm": 0.28706061840057373, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0065, + "step": 8740 + }, + { + "epoch": 0.5355284901156742, + "grad_norm": 0.21812452375888824, + "learning_rate": 1.701081551967764e-05, + "loss": 0.008, + "step": 8750 + }, + { + "epoch": 0.5361405226758064, + "grad_norm": 0.301618754863739, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 0.5367525552359386, + "grad_norm": 0.35402950644493103, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0067, + "step": 8770 + }, + { + "epoch": 0.5373645877960708, + "grad_norm": 0.2875203788280487, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0097, + "step": 8780 + }, + { + "epoch": 0.537976620356203, + "grad_norm": 0.2358965128660202, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0053, + "step": 8790 + }, + { + "epoch": 0.5385886529163352, + "grad_norm": 0.14462094008922577, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0059, + "step": 8800 + }, + { + "epoch": 0.5392006854764674, + "grad_norm": 0.17893171310424805, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0062, + "step": 8810 + }, + { + "epoch": 0.5398127180365996, + "grad_norm": 0.2923351526260376, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0078, + "step": 8820 + }, + { + "epoch": 0.5404247505967318, + "grad_norm": 0.3288479745388031, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0067, + "step": 8830 + }, + { + "epoch": 0.541036783156864, + "grad_norm": 0.3996310532093048, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5416488157169962, + "grad_norm": 0.24345380067825317, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0067, + "step": 8850 + }, + { + "epoch": 0.5422608482771284, + "grad_norm": 0.26688340306282043, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0072, + "step": 8860 + }, + { + "epoch": 0.5428728808372606, + "grad_norm": 0.4816153645515442, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0099, + "step": 8870 + }, + { + "epoch": 0.5434849133973927, + "grad_norm": 0.22544988989830017, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.007, + "step": 8880 + }, + { + "epoch": 0.5440969459575249, + "grad_norm": 0.2820419669151306, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0064, + "step": 8890 + }, + { + "epoch": 0.5447089785176571, + "grad_norm": 0.2758846879005432, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0072, + "step": 8900 + }, + { + "epoch": 0.5453210110777893, + "grad_norm": 0.4620129466056824, + "learning_rate": 1.689381359053773e-05, + "loss": 0.008, + "step": 8910 + }, + { + "epoch": 0.5459330436379215, + "grad_norm": 0.5567039847373962, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0079, + "step": 8920 + }, + { + "epoch": 0.5465450761980537, + "grad_norm": 0.347251832485199, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.006, + "step": 8930 + }, + { + "epoch": 0.5471571087581859, + "grad_norm": 0.31768012046813965, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0065, + "step": 8940 + }, + { + "epoch": 0.5477691413183181, + "grad_norm": 0.24245156347751617, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0052, + "step": 8950 + }, + { + "epoch": 0.5483811738784503, + "grad_norm": 0.2124931961297989, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0074, + "step": 8960 + }, + { + "epoch": 0.5489932064385825, + "grad_norm": 0.18998636305332184, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0056, + "step": 8970 + }, + { + "epoch": 0.5496052389987147, + "grad_norm": 0.2667362689971924, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0056, + "step": 8980 + }, + { + "epoch": 0.5502172715588469, + "grad_norm": 0.4424617886543274, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0091, + "step": 8990 + }, + { + "epoch": 0.5508293041189791, + "grad_norm": 0.33623644709587097, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0061, + "step": 9000 + }, + { + "epoch": 0.5514413366791113, + "grad_norm": 0.29990604519844055, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0059, + "step": 9010 + }, + { + "epoch": 0.5520533692392435, + "grad_norm": 0.4384118914604187, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.5526654017993757, + "grad_norm": 0.3468496799468994, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0068, + "step": 9030 + }, + { + "epoch": 0.5532774343595079, + "grad_norm": 0.3473573327064514, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0062, + "step": 9040 + }, + { + "epoch": 0.5538894669196401, + "grad_norm": 0.36125242710113525, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0062, + "step": 9050 + }, + { + "epoch": 0.5545014994797723, + "grad_norm": 0.2603420615196228, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0091, + "step": 9060 + }, + { + "epoch": 0.5551135320399045, + "grad_norm": 0.27355659008026123, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0074, + "step": 9070 + }, + { + "epoch": 0.5557255646000367, + "grad_norm": 0.24741119146347046, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0064, + "step": 9080 + }, + { + "epoch": 0.556337597160169, + "grad_norm": 0.2001475840806961, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0094, + "step": 9090 + }, + { + "epoch": 0.5569496297203012, + "grad_norm": 0.41522347927093506, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0088, + "step": 9100 + }, + { + "epoch": 0.5575616622804334, + "grad_norm": 0.27282488346099854, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0062, + "step": 9110 + }, + { + "epoch": 0.5581736948405656, + "grad_norm": 0.26905956864356995, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.007, + "step": 9120 + }, + { + "epoch": 0.5587857274006978, + "grad_norm": 0.24747484922409058, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0084, + "step": 9130 + }, + { + "epoch": 0.55939775996083, + "grad_norm": 0.1863871067762375, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0061, + "step": 9140 + }, + { + "epoch": 0.5600097925209622, + "grad_norm": 0.3599740266799927, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0063, + "step": 9150 + }, + { + "epoch": 0.5606218250810943, + "grad_norm": 0.2238125205039978, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0047, + "step": 9160 + }, + { + "epoch": 0.5612338576412265, + "grad_norm": 0.272077351808548, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.006, + "step": 9170 + }, + { + "epoch": 0.5618458902013587, + "grad_norm": 0.2371625155210495, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0064, + "step": 9180 + }, + { + "epoch": 0.5624579227614909, + "grad_norm": 0.12783293426036835, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0054, + "step": 9190 + }, + { + "epoch": 0.5630699553216231, + "grad_norm": 0.3144581615924835, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0064, + "step": 9200 + }, + { + "epoch": 0.5636819878817553, + "grad_norm": 0.31995031237602234, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0082, + "step": 9210 + }, + { + "epoch": 0.5642940204418875, + "grad_norm": 0.31995660066604614, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0083, + "step": 9220 + }, + { + "epoch": 0.5649060530020197, + "grad_norm": 0.5018982291221619, + "learning_rate": 1.665453350687773e-05, + "loss": 0.007, + "step": 9230 + }, + { + "epoch": 0.5655180855621519, + "grad_norm": 0.2927841544151306, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0104, + "step": 9240 + }, + { + "epoch": 0.5661301181222841, + "grad_norm": 0.21124979853630066, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0068, + "step": 9250 + }, + { + "epoch": 0.5667421506824163, + "grad_norm": 0.25787463784217834, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.5673541832425485, + "grad_norm": 0.3194720447063446, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0065, + "step": 9270 + }, + { + "epoch": 0.5679662158026807, + "grad_norm": 0.24165599048137665, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.006, + "step": 9280 + }, + { + "epoch": 0.5685782483628129, + "grad_norm": 0.4880482256412506, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0096, + "step": 9290 + }, + { + "epoch": 0.5691902809229451, + "grad_norm": 0.24660199880599976, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0098, + "step": 9300 + }, + { + "epoch": 0.5698023134830773, + "grad_norm": 0.24707400798797607, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0052, + "step": 9310 + }, + { + "epoch": 0.5704143460432095, + "grad_norm": 0.33855682611465454, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.011, + "step": 9320 + }, + { + "epoch": 0.5710263786033417, + "grad_norm": 0.22913751006126404, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0074, + "step": 9330 + }, + { + "epoch": 0.5716384111634739, + "grad_norm": 0.24127185344696045, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0062, + "step": 9340 + }, + { + "epoch": 0.5722504437236061, + "grad_norm": 0.26104915142059326, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0065, + "step": 9350 + }, + { + "epoch": 0.5728624762837383, + "grad_norm": 0.21698857843875885, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0062, + "step": 9360 + }, + { + "epoch": 0.5734745088438705, + "grad_norm": 0.29092445969581604, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0081, + "step": 9370 + }, + { + "epoch": 0.5740865414040027, + "grad_norm": 0.2534378468990326, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0064, + "step": 9380 + }, + { + "epoch": 0.5746985739641349, + "grad_norm": 0.28900131583213806, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.5753106065242671, + "grad_norm": 0.3028101921081543, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0104, + "step": 9400 + }, + { + "epoch": 0.5759226390843993, + "grad_norm": 0.28851139545440674, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 0.5765346716445315, + "grad_norm": 0.5735841393470764, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0072, + "step": 9420 + }, + { + "epoch": 0.5771467042046637, + "grad_norm": 0.20355567336082458, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0056, + "step": 9430 + }, + { + "epoch": 0.5777587367647958, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.006, + "step": 9440 + }, + { + "epoch": 0.578370769324928, + "grad_norm": 0.2701684832572937, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 0.5789828018850602, + "grad_norm": 0.17381855845451355, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0054, + "step": 9460 + }, + { + "epoch": 0.5795948344451924, + "grad_norm": 0.250261515378952, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5802068670053246, + "grad_norm": 0.22972841560840607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0058, + "step": 9480 + }, + { + "epoch": 0.5808188995654568, + "grad_norm": 0.22654809057712555, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0061, + "step": 9490 + }, + { + "epoch": 0.581430932125589, + "grad_norm": 0.17165100574493408, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5820429646857213, + "grad_norm": 0.2462143450975418, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0054, + "step": 9510 + }, + { + "epoch": 0.5826549972458535, + "grad_norm": 0.3970383107662201, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0069, + "step": 9520 + }, + { + "epoch": 0.5832670298059857, + "grad_norm": 0.21578988432884216, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0054, + "step": 9530 + }, + { + "epoch": 0.5838790623661179, + "grad_norm": 0.5680915713310242, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0072, + "step": 9540 + }, + { + "epoch": 0.5844910949262501, + "grad_norm": 0.24070246517658234, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0073, + "step": 9550 + }, + { + "epoch": 0.5851031274863823, + "grad_norm": 0.2524685263633728, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0068, + "step": 9560 + }, + { + "epoch": 0.5857151600465145, + "grad_norm": 0.27286672592163086, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.008, + "step": 9570 + }, + { + "epoch": 0.5863271926066467, + "grad_norm": 0.3459629714488983, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0088, + "step": 9580 + }, + { + "epoch": 0.5869392251667789, + "grad_norm": 0.2964814603328705, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0066, + "step": 9590 + }, + { + "epoch": 0.5875512577269111, + "grad_norm": 0.3559853434562683, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0064, + "step": 9600 + }, + { + "epoch": 0.5881632902870433, + "grad_norm": 0.256898432970047, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0056, + "step": 9610 + }, + { + "epoch": 0.5887753228471755, + "grad_norm": 0.25032711029052734, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0052, + "step": 9620 + }, + { + "epoch": 0.5893873554073077, + "grad_norm": 0.2467224895954132, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0048, + "step": 9630 + }, + { + "epoch": 0.5899993879674399, + "grad_norm": 0.5331161618232727, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0078, + "step": 9640 + }, + { + "epoch": 0.5906114205275721, + "grad_norm": 0.33348897099494934, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0068, + "step": 9650 + }, + { + "epoch": 0.5912234530877043, + "grad_norm": 0.21435993909835815, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0058, + "step": 9660 + }, + { + "epoch": 0.5918354856478365, + "grad_norm": 0.35850396752357483, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0068, + "step": 9670 + }, + { + "epoch": 0.5924475182079687, + "grad_norm": 0.3007623851299286, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0063, + "step": 9680 + }, + { + "epoch": 0.5930595507681009, + "grad_norm": 0.22949714958667755, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0054, + "step": 9690 + }, + { + "epoch": 0.5936715833282331, + "grad_norm": 0.23259367048740387, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.5942836158883653, + "grad_norm": 0.2305079996585846, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0047, + "step": 9710 + }, + { + "epoch": 0.5948956484484974, + "grad_norm": 0.33875930309295654, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0063, + "step": 9720 + }, + { + "epoch": 0.5955076810086296, + "grad_norm": 0.3981896936893463, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0076, + "step": 9730 + }, + { + "epoch": 0.5961197135687618, + "grad_norm": 0.280831515789032, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0075, + "step": 9740 + }, + { + "epoch": 0.596731746128894, + "grad_norm": 0.26045629382133484, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0064, + "step": 9750 + }, + { + "epoch": 0.5973437786890262, + "grad_norm": 0.23102521896362305, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0071, + "step": 9760 + }, + { + "epoch": 0.5979558112491584, + "grad_norm": 0.5013224482536316, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0063, + "step": 9770 + }, + { + "epoch": 0.5985678438092906, + "grad_norm": 0.45689067244529724, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0088, + "step": 9780 + }, + { + "epoch": 0.5991798763694228, + "grad_norm": 0.27118632197380066, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.599791908929555, + "grad_norm": 0.420202374458313, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0076, + "step": 9800 + }, + { + "epoch": 0.6004039414896872, + "grad_norm": 0.35844025015830994, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0056, + "step": 9810 + }, + { + "epoch": 0.6010159740498194, + "grad_norm": 0.2205585241317749, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0082, + "step": 9820 + }, + { + "epoch": 0.6016280066099516, + "grad_norm": 0.18860426545143127, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.007, + "step": 9830 + }, + { + "epoch": 0.6022400391700838, + "grad_norm": 0.25045180320739746, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0082, + "step": 9840 + }, + { + "epoch": 0.602852071730216, + "grad_norm": 0.2581705152988434, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0065, + "step": 9850 + }, + { + "epoch": 0.6034641042903482, + "grad_norm": 0.25894811749458313, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0058, + "step": 9860 + }, + { + "epoch": 0.6040761368504804, + "grad_norm": 0.43305444717407227, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0066, + "step": 9870 + }, + { + "epoch": 0.6046881694106127, + "grad_norm": 0.2295757383108139, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0069, + "step": 9880 + }, + { + "epoch": 0.6053002019707449, + "grad_norm": 0.29785802960395813, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0074, + "step": 9890 + }, + { + "epoch": 0.6059122345308771, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0068, + "step": 9900 + }, + { + "epoch": 0.6065242670910093, + "grad_norm": 0.29115045070648193, + "learning_rate": 1.612387195896372e-05, + "loss": 0.008, + "step": 9910 + }, + { + "epoch": 0.6071362996511415, + "grad_norm": 0.3202555477619171, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0071, + "step": 9920 + }, + { + "epoch": 0.6077483322112737, + "grad_norm": 0.2849314212799072, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.005, + "step": 9930 + }, + { + "epoch": 0.6083603647714059, + "grad_norm": 0.2768756151199341, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0051, + "step": 9940 + }, + { + "epoch": 0.6089723973315381, + "grad_norm": 0.3138035535812378, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0058, + "step": 9950 + }, + { + "epoch": 0.6095844298916703, + "grad_norm": 0.20827682316303253, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0058, + "step": 9960 + }, + { + "epoch": 0.6101964624518025, + "grad_norm": 0.29986995458602905, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0076, + "step": 9970 + }, + { + "epoch": 0.6108084950119347, + "grad_norm": 0.23564326763153076, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0056, + "step": 9980 + }, + { + "epoch": 0.6114205275720669, + "grad_norm": 0.24854765832424164, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0066, + "step": 9990 + }, + { + "epoch": 0.6120325601321991, + "grad_norm": 0.5696694850921631, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0072, + "step": 10000 + }, + { + "epoch": 0.6126445926923312, + "grad_norm": 0.24267911911010742, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0061, + "step": 10010 + }, + { + "epoch": 0.6132566252524634, + "grad_norm": 0.1955283135175705, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0076, + "step": 10020 + }, + { + "epoch": 0.6138686578125956, + "grad_norm": 0.3427830934524536, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0073, + "step": 10030 + }, + { + "epoch": 0.6144806903727278, + "grad_norm": 0.38532915711402893, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0078, + "step": 10040 + }, + { + "epoch": 0.61509272293286, + "grad_norm": 0.4302294850349426, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0071, + "step": 10050 + }, + { + "epoch": 0.6157047554929922, + "grad_norm": 0.38420233130455017, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0072, + "step": 10060 + }, + { + "epoch": 0.6163167880531244, + "grad_norm": 0.23822636902332306, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.004, + "step": 10070 + }, + { + "epoch": 0.6169288206132566, + "grad_norm": 0.25123289227485657, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0065, + "step": 10080 + }, + { + "epoch": 0.6175408531733888, + "grad_norm": 0.23007746040821075, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0059, + "step": 10090 + }, + { + "epoch": 0.618152885733521, + "grad_norm": 0.24051082134246826, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0089, + "step": 10100 + }, + { + "epoch": 0.6187649182936532, + "grad_norm": 0.26246321201324463, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0052, + "step": 10110 + }, + { + "epoch": 0.6193769508537854, + "grad_norm": 0.3160432279109955, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0059, + "step": 10120 + }, + { + "epoch": 0.6199889834139176, + "grad_norm": 0.42534199357032776, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0071, + "step": 10130 + }, + { + "epoch": 0.6206010159740498, + "grad_norm": 0.22966268658638, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0052, + "step": 10140 + }, + { + "epoch": 0.621213048534182, + "grad_norm": 0.22234882414340973, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0073, + "step": 10150 + }, + { + "epoch": 0.6218250810943142, + "grad_norm": 0.31061676144599915, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0066, + "step": 10160 + }, + { + "epoch": 0.6224371136544464, + "grad_norm": 0.34178492426872253, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.6230491462145786, + "grad_norm": 0.263583779335022, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0079, + "step": 10180 + }, + { + "epoch": 0.6236611787747108, + "grad_norm": 0.3774336278438568, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0066, + "step": 10190 + }, + { + "epoch": 0.624273211334843, + "grad_norm": 0.29274430871009827, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.007, + "step": 10200 + }, + { + "epoch": 0.6248852438949752, + "grad_norm": 0.31850868463516235, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0056, + "step": 10210 + }, + { + "epoch": 0.6254972764551074, + "grad_norm": 0.3084369897842407, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0086, + "step": 10220 + }, + { + "epoch": 0.6261093090152396, + "grad_norm": 0.21596118807792664, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0072, + "step": 10230 + }, + { + "epoch": 0.6267213415753718, + "grad_norm": 0.16397996246814728, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0075, + "step": 10240 + }, + { + "epoch": 0.627333374135504, + "grad_norm": 0.15055827796459198, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0046, + "step": 10250 + }, + { + "epoch": 0.6279454066956363, + "grad_norm": 0.23483684659004211, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0064, + "step": 10260 + }, + { + "epoch": 0.6285574392557685, + "grad_norm": 0.3131091594696045, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0069, + "step": 10270 + }, + { + "epoch": 0.6291694718159007, + "grad_norm": 0.27958226203918457, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0067, + "step": 10280 + }, + { + "epoch": 0.6297815043760328, + "grad_norm": 0.23422567546367645, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0086, + "step": 10290 + }, + { + "epoch": 0.630393536936165, + "grad_norm": 0.4644703269004822, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0067, + "step": 10300 + }, + { + "epoch": 0.6310055694962972, + "grad_norm": 0.45787107944488525, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0068, + "step": 10310 + }, + { + "epoch": 0.6316176020564294, + "grad_norm": 0.21038737893104553, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0074, + "step": 10320 + }, + { + "epoch": 0.6322296346165616, + "grad_norm": 0.23812010884284973, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0054, + "step": 10330 + }, + { + "epoch": 0.6328416671766938, + "grad_norm": 0.36856284737586975, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0061, + "step": 10340 + }, + { + "epoch": 0.633453699736826, + "grad_norm": 0.3540131151676178, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0074, + "step": 10350 + }, + { + "epoch": 0.6340657322969582, + "grad_norm": 0.3004823923110962, + "learning_rate": 1.575723252169281e-05, + "loss": 0.006, + "step": 10360 + }, + { + "epoch": 0.6346777648570904, + "grad_norm": 0.17188489437103271, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0053, + "step": 10370 + }, + { + "epoch": 0.6352897974172226, + "grad_norm": 0.21710847318172455, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0062, + "step": 10380 + }, + { + "epoch": 0.6359018299773548, + "grad_norm": 0.2356785386800766, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0061, + "step": 10390 + }, + { + "epoch": 0.636513862537487, + "grad_norm": 0.2736414670944214, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.6371258950976192, + "grad_norm": 0.23872444033622742, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.006, + "step": 10410 + }, + { + "epoch": 0.6377379276577514, + "grad_norm": 0.24478361010551453, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0073, + "step": 10420 + }, + { + "epoch": 0.6383499602178836, + "grad_norm": 0.2964334487915039, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0048, + "step": 10430 + }, + { + "epoch": 0.6389619927780158, + "grad_norm": 0.2760549783706665, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0051, + "step": 10440 + }, + { + "epoch": 0.639574025338148, + "grad_norm": 0.2598065137863159, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0072, + "step": 10450 + }, + { + "epoch": 0.6401860578982802, + "grad_norm": 0.346999853849411, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0052, + "step": 10460 + }, + { + "epoch": 0.6407980904584124, + "grad_norm": 0.31291016936302185, + "learning_rate": 1.56658563993822e-05, + "loss": 0.007, + "step": 10470 + }, + { + "epoch": 0.6414101230185446, + "grad_norm": 0.2631952166557312, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6420221555786768, + "grad_norm": 0.30895209312438965, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.006, + "step": 10490 + }, + { + "epoch": 0.642634188138809, + "grad_norm": 0.17614217102527618, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0069, + "step": 10500 + }, + { + "epoch": 0.6432462206989412, + "grad_norm": 0.38792312145233154, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0077, + "step": 10510 + }, + { + "epoch": 0.6438582532590734, + "grad_norm": 0.1722564697265625, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0054, + "step": 10520 + }, + { + "epoch": 0.6444702858192056, + "grad_norm": 0.2741699516773224, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0062, + "step": 10530 + }, + { + "epoch": 0.6450823183793378, + "grad_norm": 0.2059863954782486, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0073, + "step": 10540 + }, + { + "epoch": 0.64569435093947, + "grad_norm": 0.2702447474002838, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.6463063834996022, + "grad_norm": 0.2299312800168991, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0051, + "step": 10560 + }, + { + "epoch": 0.6469184160597343, + "grad_norm": 0.1995723992586136, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0057, + "step": 10570 + }, + { + "epoch": 0.6475304486198665, + "grad_norm": 0.30346980690956116, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0063, + "step": 10580 + }, + { + "epoch": 0.6481424811799987, + "grad_norm": 0.5040738582611084, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0064, + "step": 10590 + }, + { + "epoch": 0.6487545137401309, + "grad_norm": 0.16984818875789642, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0071, + "step": 10600 + }, + { + "epoch": 0.6493665463002631, + "grad_norm": 0.26560020446777344, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 0.6499785788603953, + "grad_norm": 0.4563823342323303, + "learning_rate": 1.554018740860716e-05, + "loss": 0.008, + "step": 10620 + }, + { + "epoch": 0.6505906114205275, + "grad_norm": 0.23272818326950073, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.006, + "step": 10630 + }, + { + "epoch": 0.6512026439806597, + "grad_norm": 0.19166870415210724, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0066, + "step": 10640 + }, + { + "epoch": 0.651814676540792, + "grad_norm": 0.2822705805301666, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0062, + "step": 10650 + }, + { + "epoch": 0.6524267091009242, + "grad_norm": 0.24001267552375793, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 0.6530387416610564, + "grad_norm": 0.2563900947570801, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0068, + "step": 10670 + }, + { + "epoch": 0.6536507742211886, + "grad_norm": 0.2747437357902527, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0058, + "step": 10680 + }, + { + "epoch": 0.6542628067813208, + "grad_norm": 0.39710354804992676, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.005, + "step": 10690 + }, + { + "epoch": 0.654874839341453, + "grad_norm": 0.30690231919288635, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0102, + "step": 10700 + }, + { + "epoch": 0.6554868719015852, + "grad_norm": 0.2879253923892975, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0072, + "step": 10710 + }, + { + "epoch": 0.6560989044617174, + "grad_norm": 0.19964110851287842, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.6567109370218496, + "grad_norm": 0.20109151303768158, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6573229695819818, + "grad_norm": 0.21469832956790924, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0048, + "step": 10740 + }, + { + "epoch": 0.657935002142114, + "grad_norm": 0.19622936844825745, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0057, + "step": 10750 + }, + { + "epoch": 0.6585470347022462, + "grad_norm": 0.2255190759897232, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0052, + "step": 10760 + }, + { + "epoch": 0.6591590672623784, + "grad_norm": 0.47484955191612244, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0066, + "step": 10770 + }, + { + "epoch": 0.6597710998225106, + "grad_norm": 0.32192179560661316, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0067, + "step": 10780 + }, + { + "epoch": 0.6603831323826428, + "grad_norm": 0.33044904470443726, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0061, + "step": 10790 + }, + { + "epoch": 0.660995164942775, + "grad_norm": 0.3206661343574524, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0072, + "step": 10800 + }, + { + "epoch": 0.6616071975029072, + "grad_norm": 0.34903818368911743, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0055, + "step": 10810 + }, + { + "epoch": 0.6622192300630394, + "grad_norm": 0.1982222944498062, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 0.6628312626231716, + "grad_norm": 0.25388309359550476, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "epoch": 0.6634432951833038, + "grad_norm": 0.2325269728899002, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0078, + "step": 10840 + }, + { + "epoch": 0.6640553277434359, + "grad_norm": 0.3364964425563812, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0054, + "step": 10850 + }, + { + "epoch": 0.6646673603035681, + "grad_norm": 0.198661208152771, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0061, + "step": 10860 + }, + { + "epoch": 0.6652793928637003, + "grad_norm": 0.333836168050766, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0058, + "step": 10870 + }, + { + "epoch": 0.6658914254238325, + "grad_norm": 0.21908101439476013, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0087, + "step": 10880 + }, + { + "epoch": 0.6665034579839647, + "grad_norm": 0.3094167709350586, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0062, + "step": 10890 + }, + { + "epoch": 0.6671154905440969, + "grad_norm": 0.28113746643066406, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0066, + "step": 10900 + }, + { + "epoch": 0.6677275231042291, + "grad_norm": 0.20239399373531342, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0071, + "step": 10910 + }, + { + "epoch": 0.6683395556643613, + "grad_norm": 0.32829156517982483, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0056, + "step": 10920 + }, + { + "epoch": 0.6689515882244935, + "grad_norm": 0.2950859069824219, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 0.6695636207846257, + "grad_norm": 0.36404141783714294, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0075, + "step": 10940 + }, + { + "epoch": 0.6701756533447579, + "grad_norm": 0.2479381114244461, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0055, + "step": 10950 + }, + { + "epoch": 0.6707876859048901, + "grad_norm": 0.1934390366077423, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.005, + "step": 10960 + }, + { + "epoch": 0.6713997184650223, + "grad_norm": 0.20912423729896545, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0064, + "step": 10970 + }, + { + "epoch": 0.6720117510251545, + "grad_norm": 0.1781405806541443, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0048, + "step": 10980 + }, + { + "epoch": 0.6726237835852867, + "grad_norm": 0.18812811374664307, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0048, + "step": 10990 + }, + { + "epoch": 0.6732358161454189, + "grad_norm": 0.2006077766418457, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0073, + "step": 11000 + }, + { + "epoch": 0.6738478487055511, + "grad_norm": 0.20471568405628204, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0068, + "step": 11010 + }, + { + "epoch": 0.6744598812656833, + "grad_norm": 0.2979716658592224, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0067, + "step": 11020 + }, + { + "epoch": 0.6750719138258156, + "grad_norm": 0.3256290853023529, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0054, + "step": 11030 + }, + { + "epoch": 0.6756839463859478, + "grad_norm": 0.3346560001373291, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0061, + "step": 11040 + }, + { + "epoch": 0.67629597894608, + "grad_norm": 0.35791122913360596, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0054, + "step": 11050 + }, + { + "epoch": 0.6769080115062122, + "grad_norm": 0.30428826808929443, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0066, + "step": 11060 + }, + { + "epoch": 0.6775200440663444, + "grad_norm": 0.31254154443740845, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0065, + "step": 11070 + }, + { + "epoch": 0.6781320766264766, + "grad_norm": 0.263028621673584, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0062, + "step": 11080 + }, + { + "epoch": 0.6787441091866088, + "grad_norm": 0.22496990859508514, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.679356141746741, + "grad_norm": 0.2647632360458374, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0072, + "step": 11100 + }, + { + "epoch": 0.6799681743068732, + "grad_norm": 0.2517150342464447, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0064, + "step": 11110 + }, + { + "epoch": 0.6805802068670054, + "grad_norm": 0.30550616979599, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0055, + "step": 11120 + }, + { + "epoch": 0.6811922394271375, + "grad_norm": 0.21312931180000305, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0074, + "step": 11130 + }, + { + "epoch": 0.6818042719872697, + "grad_norm": 0.21152199804782867, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0047, + "step": 11140 + }, + { + "epoch": 0.6824163045474019, + "grad_norm": 0.2030613273382187, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0045, + "step": 11150 + }, + { + "epoch": 0.6830283371075341, + "grad_norm": 0.30646151304244995, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0045, + "step": 11160 + }, + { + "epoch": 0.6836403696676663, + "grad_norm": 0.2693783938884735, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0061, + "step": 11170 + }, + { + "epoch": 0.6842524022277985, + "grad_norm": 0.25288495421409607, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0068, + "step": 11180 + }, + { + "epoch": 0.6848644347879307, + "grad_norm": 0.34989964962005615, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.007, + "step": 11190 + }, + { + "epoch": 0.6854764673480629, + "grad_norm": 0.192350834608078, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0064, + "step": 11200 + }, + { + "epoch": 0.6860884999081951, + "grad_norm": 0.3841196894645691, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.6867005324683273, + "grad_norm": 0.2168666571378708, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0063, + "step": 11220 + }, + { + "epoch": 0.6873125650284595, + "grad_norm": 0.2756234109401703, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0068, + "step": 11230 + }, + { + "epoch": 0.6879245975885917, + "grad_norm": 0.1971903294324875, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.006, + "step": 11240 + }, + { + "epoch": 0.6885366301487239, + "grad_norm": 0.3857499659061432, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0063, + "step": 11250 + }, + { + "epoch": 0.6891486627088561, + "grad_norm": 0.194110706448555, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0049, + "step": 11260 + }, + { + "epoch": 0.6897606952689883, + "grad_norm": 0.24935179948806763, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0054, + "step": 11270 + }, + { + "epoch": 0.6903727278291205, + "grad_norm": 0.5208527445793152, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0062, + "step": 11280 + }, + { + "epoch": 0.6909847603892527, + "grad_norm": 0.2917899191379547, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0058, + "step": 11290 + }, + { + "epoch": 0.6915967929493849, + "grad_norm": 0.42692577838897705, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0072, + "step": 11300 + }, + { + "epoch": 0.6922088255095171, + "grad_norm": 0.36888429522514343, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0059, + "step": 11310 + }, + { + "epoch": 0.6928208580696493, + "grad_norm": 0.26246029138565063, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0057, + "step": 11320 + }, + { + "epoch": 0.6934328906297815, + "grad_norm": 0.22163739800453186, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0078, + "step": 11330 + }, + { + "epoch": 0.6940449231899137, + "grad_norm": 0.33411458134651184, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0053, + "step": 11340 + }, + { + "epoch": 0.6946569557500459, + "grad_norm": 0.2792898118495941, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0095, + "step": 11350 + }, + { + "epoch": 0.6952689883101781, + "grad_norm": 0.2770175039768219, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0059, + "step": 11360 + }, + { + "epoch": 0.6958810208703103, + "grad_norm": 0.14913171529769897, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0046, + "step": 11370 + }, + { + "epoch": 0.6964930534304425, + "grad_norm": 0.22906239330768585, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0054, + "step": 11380 + }, + { + "epoch": 0.6971050859905747, + "grad_norm": 0.2854336202144623, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0052, + "step": 11390 + }, + { + "epoch": 0.697717118550707, + "grad_norm": 0.21835818886756897, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0064, + "step": 11400 + }, + { + "epoch": 0.698329151110839, + "grad_norm": 0.42180293798446655, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0071, + "step": 11410 + }, + { + "epoch": 0.6989411836709712, + "grad_norm": 0.3056841492652893, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0092, + "step": 11420 + }, + { + "epoch": 0.6995532162311034, + "grad_norm": 0.15149559080600739, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0049, + "step": 11430 + }, + { + "epoch": 0.7001652487912357, + "grad_norm": 0.15561188757419586, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0051, + "step": 11440 + }, + { + "epoch": 0.7007772813513679, + "grad_norm": 0.2941122055053711, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0065, + "step": 11450 + }, + { + "epoch": 0.7013893139115001, + "grad_norm": 0.3008195757865906, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0059, + "step": 11460 + }, + { + "epoch": 0.7020013464716323, + "grad_norm": 0.3787235617637634, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0068, + "step": 11470 + }, + { + "epoch": 0.7026133790317645, + "grad_norm": 0.2069675624370575, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.005, + "step": 11480 + }, + { + "epoch": 0.7032254115918967, + "grad_norm": 0.33505553007125854, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0058, + "step": 11490 + }, + { + "epoch": 0.7038374441520289, + "grad_norm": 0.281213641166687, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0064, + "step": 11500 + }, + { + "epoch": 0.7044494767121611, + "grad_norm": 0.28471192717552185, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0066, + "step": 11510 + }, + { + "epoch": 0.7050615092722933, + "grad_norm": 0.3166801929473877, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0062, + "step": 11520 + }, + { + "epoch": 0.7056735418324255, + "grad_norm": 0.26893407106399536, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.005, + "step": 11530 + }, + { + "epoch": 0.7062855743925577, + "grad_norm": 0.17421478033065796, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0063, + "step": 11540 + }, + { + "epoch": 0.7068976069526899, + "grad_norm": 0.40999990701675415, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 0.7075096395128221, + "grad_norm": 0.190180242061615, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0061, + "step": 11560 + }, + { + "epoch": 0.7081216720729543, + "grad_norm": 0.20383603870868683, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0049, + "step": 11570 + }, + { + "epoch": 0.7087337046330865, + "grad_norm": 0.28741395473480225, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.7093457371932187, + "grad_norm": 0.24231962859630585, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.008, + "step": 11590 + }, + { + "epoch": 0.7099577697533509, + "grad_norm": 0.2221115529537201, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0053, + "step": 11600 + }, + { + "epoch": 0.7105698023134831, + "grad_norm": 0.18564820289611816, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0072, + "step": 11610 + }, + { + "epoch": 0.7111818348736153, + "grad_norm": 0.3734343647956848, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0089, + "step": 11620 + }, + { + "epoch": 0.7117938674337475, + "grad_norm": 0.3215912878513336, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0093, + "step": 11630 + }, + { + "epoch": 0.7124058999938797, + "grad_norm": 0.22602899372577667, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0062, + "step": 11640 + }, + { + "epoch": 0.7130179325540119, + "grad_norm": 0.3115978538990021, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.006, + "step": 11650 + }, + { + "epoch": 0.7136299651141441, + "grad_norm": 0.26148155331611633, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0071, + "step": 11660 + }, + { + "epoch": 0.7142419976742763, + "grad_norm": 0.142781600356102, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0051, + "step": 11670 + }, + { + "epoch": 0.7148540302344085, + "grad_norm": 0.21306048333644867, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0068, + "step": 11680 + }, + { + "epoch": 0.7154660627945407, + "grad_norm": 0.3439876437187195, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7160780953546728, + "grad_norm": 0.4010280966758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0062, + "step": 11700 + }, + { + "epoch": 0.716690127914805, + "grad_norm": 0.2760031819343567, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 0.7173021604749372, + "grad_norm": 0.45097261667251587, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0062, + "step": 11720 + }, + { + "epoch": 0.7179141930350694, + "grad_norm": 0.20118115842342377, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0054, + "step": 11730 + }, + { + "epoch": 0.7185262255952016, + "grad_norm": 0.3090760409832001, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0054, + "step": 11740 + }, + { + "epoch": 0.7191382581553338, + "grad_norm": 0.25016647577285767, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0077, + "step": 11750 + }, + { + "epoch": 0.719750290715466, + "grad_norm": 0.2310703545808792, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0053, + "step": 11760 + }, + { + "epoch": 0.7203623232755982, + "grad_norm": 0.2269359678030014, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.006, + "step": 11770 + }, + { + "epoch": 0.7209743558357304, + "grad_norm": 0.3917788565158844, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7215863883958626, + "grad_norm": 0.25999465584754944, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0064, + "step": 11790 + }, + { + "epoch": 0.7221984209559948, + "grad_norm": 0.19340357184410095, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0071, + "step": 11800 + }, + { + "epoch": 0.722810453516127, + "grad_norm": 0.25046268105506897, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0078, + "step": 11810 + }, + { + "epoch": 0.7234224860762593, + "grad_norm": 0.19819264113903046, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.005, + "step": 11820 + }, + { + "epoch": 0.7240345186363915, + "grad_norm": 0.43484950065612793, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0078, + "step": 11830 + }, + { + "epoch": 0.7246465511965237, + "grad_norm": 0.29191601276397705, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7252585837566559, + "grad_norm": 0.21717441082000732, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0056, + "step": 11850 + }, + { + "epoch": 0.7258706163167881, + "grad_norm": 0.3210129737854004, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0072, + "step": 11860 + }, + { + "epoch": 0.7264826488769203, + "grad_norm": 0.33192649483680725, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0061, + "step": 11870 + }, + { + "epoch": 0.7270946814370525, + "grad_norm": 0.14648163318634033, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7277067139971847, + "grad_norm": 0.20028764009475708, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0052, + "step": 11890 + }, + { + "epoch": 0.7283187465573169, + "grad_norm": 0.21449612081050873, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0063, + "step": 11900 + }, + { + "epoch": 0.7289307791174491, + "grad_norm": 0.27472081780433655, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0062, + "step": 11910 + }, + { + "epoch": 0.7295428116775813, + "grad_norm": 0.2919130027294159, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0048, + "step": 11920 + }, + { + "epoch": 0.7301548442377135, + "grad_norm": 0.153092160820961, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0053, + "step": 11930 + }, + { + "epoch": 0.7307668767978457, + "grad_norm": 0.22820086777210236, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0058, + "step": 11940 + }, + { + "epoch": 0.7313789093579779, + "grad_norm": 0.24281881749629974, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0044, + "step": 11950 + }, + { + "epoch": 0.7319909419181101, + "grad_norm": 0.32581812143325806, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0051, + "step": 11960 + }, + { + "epoch": 0.7326029744782423, + "grad_norm": 0.3139822483062744, + "learning_rate": 1.435930222050582e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 0.7332150070383744, + "grad_norm": 0.37985655665397644, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0052, + "step": 11980 + }, + { + "epoch": 0.7338270395985066, + "grad_norm": 0.1958508938550949, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.007, + "step": 11990 + }, + { + "epoch": 0.7344390721586388, + "grad_norm": 0.25318172574043274, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0084, + "step": 12000 + }, + { + "epoch": 0.735051104718771, + "grad_norm": 0.33245304226875305, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0051, + "step": 12010 + }, + { + "epoch": 0.7356631372789032, + "grad_norm": 0.2750372290611267, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0057, + "step": 12020 + }, + { + "epoch": 0.7362751698390354, + "grad_norm": 0.2057010382413864, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0057, + "step": 12030 + }, + { + "epoch": 0.7368872023991676, + "grad_norm": 0.30713731050491333, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0067, + "step": 12040 + }, + { + "epoch": 0.7374992349592998, + "grad_norm": 0.20423808693885803, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.006, + "step": 12050 + }, + { + "epoch": 0.738111267519432, + "grad_norm": 0.3129539489746094, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0067, + "step": 12060 + }, + { + "epoch": 0.7387233000795642, + "grad_norm": 0.25026270747184753, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7393353326396964, + "grad_norm": 0.4147534668445587, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0061, + "step": 12080 + }, + { + "epoch": 0.7399473651998286, + "grad_norm": 0.20954278111457825, + "learning_rate": 1.425047976058418e-05, + "loss": 0.006, + "step": 12090 + }, + { + "epoch": 0.7405593977599608, + "grad_norm": 0.2700798809528351, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 0.741171430320093, + "grad_norm": 0.2597086429595947, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0059, + "step": 12110 + }, + { + "epoch": 0.7417834628802252, + "grad_norm": 0.2674495279788971, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0045, + "step": 12120 + }, + { + "epoch": 0.7423954954403574, + "grad_norm": 0.24583879113197327, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0061, + "step": 12130 + }, + { + "epoch": 0.7430075280004896, + "grad_norm": 0.23704801499843597, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0054, + "step": 12140 + }, + { + "epoch": 0.7436195605606218, + "grad_norm": 0.2381024807691574, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0073, + "step": 12150 + }, + { + "epoch": 0.744231593120754, + "grad_norm": 0.24937355518341064, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0049, + "step": 12160 + }, + { + "epoch": 0.7448436256808862, + "grad_norm": 0.20442882180213928, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0061, + "step": 12170 + }, + { + "epoch": 0.7454556582410184, + "grad_norm": 0.3053426742553711, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0087, + "step": 12180 + }, + { + "epoch": 0.7460676908011507, + "grad_norm": 0.3654315769672394, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0047, + "step": 12190 + }, + { + "epoch": 0.7466797233612829, + "grad_norm": 0.18926535546779633, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7472917559214151, + "grad_norm": 0.21620485186576843, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0094, + "step": 12210 + }, + { + "epoch": 0.7479037884815473, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0059, + "step": 12220 + }, + { + "epoch": 0.7485158210416795, + "grad_norm": 0.39795419573783875, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 0.7491278536018117, + "grad_norm": 0.20502857863903046, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0048, + "step": 12240 + }, + { + "epoch": 0.7497398861619439, + "grad_norm": 0.23821429908275604, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0082, + "step": 12250 + }, + { + "epoch": 0.750351918722076, + "grad_norm": 0.45541366934776306, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0071, + "step": 12260 + }, + { + "epoch": 0.7509639512822082, + "grad_norm": 0.24881400167942047, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.7515759838423404, + "grad_norm": 0.2409125715494156, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0061, + "step": 12280 + }, + { + "epoch": 0.7521880164024726, + "grad_norm": 0.2930417060852051, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0054, + "step": 12290 + }, + { + "epoch": 0.7528000489626048, + "grad_norm": 0.30566394329071045, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0054, + "step": 12300 + }, + { + "epoch": 0.753412081522737, + "grad_norm": 0.32679763436317444, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0059, + "step": 12310 + }, + { + "epoch": 0.7540241140828692, + "grad_norm": 0.29273876547813416, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0067, + "step": 12320 + }, + { + "epoch": 0.7546361466430014, + "grad_norm": 0.19642773270606995, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0062, + "step": 12330 + }, + { + "epoch": 0.7552481792031336, + "grad_norm": 0.21928250789642334, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0066, + "step": 12340 + }, + { + "epoch": 0.7558602117632658, + "grad_norm": 0.2534322738647461, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.756472244323398, + "grad_norm": 0.20712649822235107, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 0.7570842768835302, + "grad_norm": 0.18670639395713806, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0063, + "step": 12370 + }, + { + "epoch": 0.7576963094436624, + "grad_norm": 0.26770254969596863, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0052, + "step": 12380 + }, + { + "epoch": 0.7583083420037946, + "grad_norm": 0.3621291518211365, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0056, + "step": 12390 + }, + { + "epoch": 0.7589203745639268, + "grad_norm": 0.31771939992904663, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0059, + "step": 12400 + }, + { + "epoch": 0.759532407124059, + "grad_norm": 0.44418177008628845, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0065, + "step": 12410 + }, + { + "epoch": 0.7601444396841912, + "grad_norm": 0.2183474898338318, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0046, + "step": 12420 + }, + { + "epoch": 0.7607564722443234, + "grad_norm": 0.4400590658187866, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0061, + "step": 12430 + }, + { + "epoch": 0.7613685048044556, + "grad_norm": 0.296539843082428, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0059, + "step": 12440 + }, + { + "epoch": 0.7619805373645878, + "grad_norm": 0.352870374917984, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0055, + "step": 12450 + }, + { + "epoch": 0.76259256992472, + "grad_norm": 0.19494596123695374, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.7632046024848522, + "grad_norm": 0.3799489438533783, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0057, + "step": 12470 + }, + { + "epoch": 0.7638166350449844, + "grad_norm": 0.3572365641593933, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0053, + "step": 12480 + }, + { + "epoch": 0.7644286676051166, + "grad_norm": 0.2559097707271576, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0062, + "step": 12490 + }, + { + "epoch": 0.7650407001652488, + "grad_norm": 0.13144978880882263, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0065, + "step": 12500 + }, + { + "epoch": 0.765652732725381, + "grad_norm": 0.34635287523269653, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7662647652855132, + "grad_norm": 0.25615188479423523, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0057, + "step": 12520 + }, + { + "epoch": 0.7668767978456454, + "grad_norm": 0.17619644105434418, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0047, + "step": 12530 + }, + { + "epoch": 0.7674888304057775, + "grad_norm": 0.20169994235038757, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0068, + "step": 12540 + }, + { + "epoch": 0.7681008629659097, + "grad_norm": 0.49686071276664734, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0066, + "step": 12550 + }, + { + "epoch": 0.7687128955260419, + "grad_norm": 0.28179335594177246, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0058, + "step": 12560 + }, + { + "epoch": 0.7693249280861741, + "grad_norm": 0.28156182169914246, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.005, + "step": 12570 + }, + { + "epoch": 0.7699369606463063, + "grad_norm": 0.15054315328598022, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0051, + "step": 12580 + }, + { + "epoch": 0.7705489932064385, + "grad_norm": 0.22872644662857056, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0066, + "step": 12590 + }, + { + "epoch": 0.7711610257665708, + "grad_norm": 0.25821951031684875, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0054, + "step": 12600 + }, + { + "epoch": 0.771773058326703, + "grad_norm": 0.23592771589756012, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0059, + "step": 12610 + }, + { + "epoch": 0.7723850908868352, + "grad_norm": 0.34409141540527344, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0053, + "step": 12620 + }, + { + "epoch": 0.7729971234469674, + "grad_norm": 0.2803158760070801, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0042, + "step": 12630 + }, + { + "epoch": 0.7736091560070996, + "grad_norm": 0.32796284556388855, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0074, + "step": 12640 + }, + { + "epoch": 0.7742211885672318, + "grad_norm": 0.34749120473861694, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0054, + "step": 12650 + }, + { + "epoch": 0.774833221127364, + "grad_norm": 0.34066343307495117, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0082, + "step": 12660 + }, + { + "epoch": 0.7754452536874962, + "grad_norm": 0.4294384717941284, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0062, + "step": 12670 + }, + { + "epoch": 0.7760572862476284, + "grad_norm": 0.2355230748653412, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0069, + "step": 12680 + }, + { + "epoch": 0.7766693188077606, + "grad_norm": 0.3181976079940796, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 0.7772813513678928, + "grad_norm": 0.2763727605342865, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0052, + "step": 12700 + }, + { + "epoch": 0.777893383928025, + "grad_norm": 0.2938949465751648, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0041, + "step": 12710 + }, + { + "epoch": 0.7785054164881572, + "grad_norm": 0.31331220269203186, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0062, + "step": 12720 + }, + { + "epoch": 0.7791174490482894, + "grad_norm": 0.3389904797077179, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0061, + "step": 12730 + }, + { + "epoch": 0.7797294816084216, + "grad_norm": 0.2848975360393524, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0065, + "step": 12740 + }, + { + "epoch": 0.7803415141685538, + "grad_norm": 0.29838478565216064, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0061, + "step": 12750 + }, + { + "epoch": 0.780953546728686, + "grad_norm": 0.47004032135009766, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0066, + "step": 12760 + }, + { + "epoch": 0.7815655792888182, + "grad_norm": 0.26898056268692017, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0063, + "step": 12770 + }, + { + "epoch": 0.7821776118489504, + "grad_norm": 0.29459917545318604, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0065, + "step": 12780 + }, + { + "epoch": 0.7827896444090826, + "grad_norm": 0.3481508791446686, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0058, + "step": 12790 + }, + { + "epoch": 0.7834016769692148, + "grad_norm": 0.1707627922296524, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0053, + "step": 12800 + }, + { + "epoch": 0.784013709529347, + "grad_norm": 0.14735333621501923, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0058, + "step": 12810 + }, + { + "epoch": 0.7846257420894791, + "grad_norm": 0.28002044558525085, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.006, + "step": 12820 + }, + { + "epoch": 0.7852377746496113, + "grad_norm": 0.39598894119262695, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0062, + "step": 12830 + }, + { + "epoch": 0.7858498072097435, + "grad_norm": 0.19379247725009918, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0058, + "step": 12840 + }, + { + "epoch": 0.7864618397698757, + "grad_norm": 0.27260729670524597, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7870738723300079, + "grad_norm": 0.2845087945461273, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0052, + "step": 12860 + }, + { + "epoch": 0.7876859048901401, + "grad_norm": 0.37151217460632324, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0043, + "step": 12870 + }, + { + "epoch": 0.7882979374502723, + "grad_norm": 0.3387412130832672, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0046, + "step": 12880 + }, + { + "epoch": 0.7889099700104045, + "grad_norm": 0.42672809958457947, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7895220025705367, + "grad_norm": 0.20378202199935913, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.7901340351306689, + "grad_norm": 0.16417330503463745, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0045, + "step": 12910 + }, + { + "epoch": 0.7907460676908011, + "grad_norm": 0.1704142540693283, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0054, + "step": 12920 + }, + { + "epoch": 0.7913581002509333, + "grad_norm": 0.21494890749454498, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.7919701328110655, + "grad_norm": 0.3430638909339905, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0046, + "step": 12940 + }, + { + "epoch": 0.7925821653711977, + "grad_norm": 0.22641201317310333, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0049, + "step": 12950 + }, + { + "epoch": 0.79319419793133, + "grad_norm": 0.27153971791267395, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 0.7938062304914622, + "grad_norm": 0.2648560702800751, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 0.7944182630515944, + "grad_norm": 0.2148633897304535, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0048, + "step": 12980 + }, + { + "epoch": 0.7950302956117266, + "grad_norm": 0.35170191526412964, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0069, + "step": 12990 + }, + { + "epoch": 0.7956423281718588, + "grad_norm": 0.3539712429046631, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0067, + "step": 13000 + }, + { + "epoch": 0.796254360731991, + "grad_norm": 0.29938259720802307, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0102, + "step": 13010 + }, + { + "epoch": 0.7968663932921232, + "grad_norm": 0.35241010785102844, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7974784258522554, + "grad_norm": 0.2929113805294037, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0061, + "step": 13030 + }, + { + "epoch": 0.7980904584123876, + "grad_norm": 0.24052929878234863, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0046, + "step": 13040 + }, + { + "epoch": 0.7987024909725198, + "grad_norm": 0.21611042320728302, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0043, + "step": 13050 + }, + { + "epoch": 0.799314523532652, + "grad_norm": 0.23498570919036865, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.7999265560927842, + "grad_norm": 0.30229923129081726, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0068, + "step": 13070 + }, + { + "epoch": 0.8005385886529164, + "grad_norm": 0.2916681170463562, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0062, + "step": 13080 + }, + { + "epoch": 0.8011506212130486, + "grad_norm": 0.31905195116996765, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.8017626537731807, + "grad_norm": 0.22307109832763672, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 0.8023746863333129, + "grad_norm": 0.2815198004245758, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0054, + "step": 13110 + }, + { + "epoch": 0.8029867188934451, + "grad_norm": 0.18762829899787903, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0064, + "step": 13120 + }, + { + "epoch": 0.8035987514535773, + "grad_norm": 0.1918255090713501, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0064, + "step": 13130 + }, + { + "epoch": 0.8042107840137095, + "grad_norm": 0.3726229667663574, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0065, + "step": 13140 + }, + { + "epoch": 0.8048228165738417, + "grad_norm": 0.423285573720932, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0062, + "step": 13150 + }, + { + "epoch": 0.8054348491339739, + "grad_norm": 0.1709958165884018, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0052, + "step": 13160 + }, + { + "epoch": 0.8060468816941061, + "grad_norm": 0.3615981936454773, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 0.8066589142542383, + "grad_norm": 0.2101999819278717, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0053, + "step": 13180 + }, + { + "epoch": 0.8072709468143705, + "grad_norm": 0.14393582940101624, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.8078829793745027, + "grad_norm": 0.3704521656036377, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0056, + "step": 13200 + }, + { + "epoch": 0.8084950119346349, + "grad_norm": 0.23275913298130035, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0051, + "step": 13210 + }, + { + "epoch": 0.8091070444947671, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0045, + "step": 13220 + }, + { + "epoch": 0.8097190770548993, + "grad_norm": 0.21721667051315308, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.8103311096150315, + "grad_norm": 0.29456019401550293, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 0.8109431421751637, + "grad_norm": 0.19854630529880524, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0071, + "step": 13250 + }, + { + "epoch": 0.8115551747352959, + "grad_norm": 0.4318163990974426, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0059, + "step": 13260 + }, + { + "epoch": 0.8121672072954281, + "grad_norm": 0.3421531915664673, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.006, + "step": 13270 + }, + { + "epoch": 0.8127792398555603, + "grad_norm": 0.2370125651359558, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0054, + "step": 13280 + }, + { + "epoch": 0.8133912724156925, + "grad_norm": 0.2996460497379303, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 0.8140033049758247, + "grad_norm": 0.2911904454231262, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0053, + "step": 13300 + }, + { + "epoch": 0.8146153375359569, + "grad_norm": 0.26010408997535706, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0053, + "step": 13310 + }, + { + "epoch": 0.8152273700960891, + "grad_norm": 0.404702752828598, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0081, + "step": 13320 + }, + { + "epoch": 0.8158394026562213, + "grad_norm": 0.25591781735420227, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0057, + "step": 13330 + }, + { + "epoch": 0.8164514352163535, + "grad_norm": 0.1437849998474121, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0064, + "step": 13340 + }, + { + "epoch": 0.8170634677764858, + "grad_norm": 0.12252022325992584, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0047, + "step": 13350 + }, + { + "epoch": 0.817675500336618, + "grad_norm": 0.1861230581998825, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0063, + "step": 13360 + }, + { + "epoch": 0.8182875328967502, + "grad_norm": 0.2313026636838913, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0066, + "step": 13370 + }, + { + "epoch": 0.8188995654568824, + "grad_norm": 0.5445839166641235, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0076, + "step": 13380 + }, + { + "epoch": 0.8195115980170145, + "grad_norm": 0.21818871796131134, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0068, + "step": 13390 + }, + { + "epoch": 0.8201236305771467, + "grad_norm": 0.21823963522911072, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0072, + "step": 13400 + }, + { + "epoch": 0.8207356631372789, + "grad_norm": 0.1730659157037735, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0051, + "step": 13410 + }, + { + "epoch": 0.8213476956974111, + "grad_norm": 0.1301007866859436, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0075, + "step": 13420 + }, + { + "epoch": 0.8219597282575433, + "grad_norm": 0.32452520728111267, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.005, + "step": 13430 + }, + { + "epoch": 0.8225717608176755, + "grad_norm": 0.24771001935005188, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0058, + "step": 13440 + }, + { + "epoch": 0.8231837933778077, + "grad_norm": 0.4575227200984955, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0062, + "step": 13450 + }, + { + "epoch": 0.8237958259379399, + "grad_norm": 0.16441279649734497, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0081, + "step": 13460 + }, + { + "epoch": 0.8244078584980721, + "grad_norm": 0.26582902669906616, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0069, + "step": 13470 + }, + { + "epoch": 0.8250198910582043, + "grad_norm": 0.18871302902698517, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0068, + "step": 13480 + }, + { + "epoch": 0.8256319236183365, + "grad_norm": 0.23244783282279968, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0063, + "step": 13490 + }, + { + "epoch": 0.8262439561784687, + "grad_norm": 0.2399880290031433, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0036, + "step": 13500 + }, + { + "epoch": 0.8268559887386009, + "grad_norm": 0.25766822695732117, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0074, + "step": 13510 + }, + { + "epoch": 0.8274680212987331, + "grad_norm": 0.24792100489139557, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0066, + "step": 13520 + }, + { + "epoch": 0.8280800538588653, + "grad_norm": 0.3371896743774414, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8286920864189975, + "grad_norm": 0.16249819099903107, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0079, + "step": 13540 + }, + { + "epoch": 0.8293041189791297, + "grad_norm": 0.2705139219760895, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0069, + "step": 13550 + }, + { + "epoch": 0.8299161515392619, + "grad_norm": 0.1905352771282196, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0055, + "step": 13560 + }, + { + "epoch": 0.8305281840993941, + "grad_norm": 0.23938500881195068, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0055, + "step": 13570 + }, + { + "epoch": 0.8311402166595263, + "grad_norm": 0.3562251031398773, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0054, + "step": 13580 + }, + { + "epoch": 0.8317522492196585, + "grad_norm": 0.2934769093990326, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0064, + "step": 13590 + }, + { + "epoch": 0.8323642817797907, + "grad_norm": 0.252366840839386, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0078, + "step": 13600 + }, + { + "epoch": 0.8329763143399229, + "grad_norm": 0.16646964848041534, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0046, + "step": 13610 + }, + { + "epoch": 0.8335883469000551, + "grad_norm": 0.22584658861160278, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8342003794601873, + "grad_norm": 0.3578774034976959, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0049, + "step": 13630 + }, + { + "epoch": 0.8348124120203195, + "grad_norm": 0.3447739779949188, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0065, + "step": 13640 + }, + { + "epoch": 0.8354244445804517, + "grad_norm": 0.381954550743103, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0057, + "step": 13650 + }, + { + "epoch": 0.8360364771405839, + "grad_norm": 0.3563731908798218, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0065, + "step": 13660 + }, + { + "epoch": 0.836648509700716, + "grad_norm": 0.29516372084617615, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0053, + "step": 13670 + }, + { + "epoch": 0.8372605422608482, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0043, + "step": 13680 + }, + { + "epoch": 0.8378725748209804, + "grad_norm": 0.4608387351036072, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.005, + "step": 13690 + }, + { + "epoch": 0.8384846073811126, + "grad_norm": 0.31025534868240356, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0055, + "step": 13700 + }, + { + "epoch": 0.8390966399412448, + "grad_norm": 0.32904690504074097, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.839708672501377, + "grad_norm": 0.2547053098678589, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0061, + "step": 13720 + }, + { + "epoch": 0.8403207050615092, + "grad_norm": 0.30524104833602905, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.005, + "step": 13730 + }, + { + "epoch": 0.8409327376216414, + "grad_norm": 0.17741642892360687, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "epoch": 0.8415447701817736, + "grad_norm": 0.23125578463077545, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0053, + "step": 13750 + }, + { + "epoch": 0.8421568027419059, + "grad_norm": 0.3080023229122162, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0075, + "step": 13760 + }, + { + "epoch": 0.842768835302038, + "grad_norm": 0.2509821951389313, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0053, + "step": 13770 + }, + { + "epoch": 0.8433808678621703, + "grad_norm": 0.17483864724636078, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.005, + "step": 13780 + }, + { + "epoch": 0.8439929004223025, + "grad_norm": 0.3952518403530121, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0056, + "step": 13790 + }, + { + "epoch": 0.8446049329824347, + "grad_norm": 0.2945535480976105, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0055, + "step": 13800 + }, + { + "epoch": 0.8452169655425669, + "grad_norm": 0.13024291396141052, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0073, + "step": 13810 + }, + { + "epoch": 0.8458289981026991, + "grad_norm": 0.1840520054101944, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0061, + "step": 13820 + }, + { + "epoch": 0.8464410306628313, + "grad_norm": 0.2368786782026291, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0058, + "step": 13830 + }, + { + "epoch": 0.8470530632229635, + "grad_norm": 0.2885456085205078, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0055, + "step": 13840 + }, + { + "epoch": 0.8476650957830957, + "grad_norm": 0.2782488167285919, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0046, + "step": 13850 + }, + { + "epoch": 0.8482771283432279, + "grad_norm": 0.1711442470550537, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0055, + "step": 13860 + }, + { + "epoch": 0.8488891609033601, + "grad_norm": 0.22235877811908722, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0056, + "step": 13870 + }, + { + "epoch": 0.8495011934634923, + "grad_norm": 0.1937183290719986, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0047, + "step": 13880 + }, + { + "epoch": 0.8501132260236245, + "grad_norm": 0.33960190415382385, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0063, + "step": 13890 + }, + { + "epoch": 0.8507252585837567, + "grad_norm": 0.1983388215303421, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8513372911438889, + "grad_norm": 0.2968246638774872, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0051, + "step": 13910 + }, + { + "epoch": 0.8519493237040211, + "grad_norm": 0.25328314304351807, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0057, + "step": 13920 + }, + { + "epoch": 0.8525613562641533, + "grad_norm": 0.2435184270143509, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.8531733888242855, + "grad_norm": 0.24512560665607452, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0053, + "step": 13940 + }, + { + "epoch": 0.8537854213844176, + "grad_norm": 0.22028976678848267, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.006, + "step": 13950 + }, + { + "epoch": 0.8543974539445498, + "grad_norm": 0.24743935465812683, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0065, + "step": 13960 + }, + { + "epoch": 0.855009486504682, + "grad_norm": 0.1393810361623764, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0081, + "step": 13970 + }, + { + "epoch": 0.8556215190648142, + "grad_norm": 0.25975972414016724, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0063, + "step": 13980 + }, + { + "epoch": 0.8562335516249464, + "grad_norm": 0.1944616585969925, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0049, + "step": 13990 + }, + { + "epoch": 0.8568455841850786, + "grad_norm": 0.21936742961406708, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.8574576167452108, + "grad_norm": 0.1556629091501236, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 0.858069649305343, + "grad_norm": 0.23696991801261902, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 0.8586816818654752, + "grad_norm": 0.32507795095443726, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0061, + "step": 14030 + }, + { + "epoch": 0.8592937144256074, + "grad_norm": 0.35332199931144714, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0058, + "step": 14040 + }, + { + "epoch": 0.8599057469857396, + "grad_norm": 0.1835644394159317, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0046, + "step": 14050 + }, + { + "epoch": 0.8605177795458718, + "grad_norm": 0.19127517938613892, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0044, + "step": 14060 + }, + { + "epoch": 0.861129812106004, + "grad_norm": 0.30748996138572693, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0055, + "step": 14070 + }, + { + "epoch": 0.8617418446661362, + "grad_norm": 0.178785502910614, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0049, + "step": 14080 + }, + { + "epoch": 0.8623538772262684, + "grad_norm": 0.16979056596755981, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0044, + "step": 14090 + }, + { + "epoch": 0.8629659097864006, + "grad_norm": 0.19519983232021332, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0063, + "step": 14100 + }, + { + "epoch": 0.8635779423465328, + "grad_norm": 0.2722550928592682, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 0.864189974906665, + "grad_norm": 0.1956222504377365, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0054, + "step": 14120 + }, + { + "epoch": 0.8648020074667973, + "grad_norm": 0.32274308800697327, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0066, + "step": 14130 + }, + { + "epoch": 0.8654140400269295, + "grad_norm": 0.25953641533851624, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0056, + "step": 14140 + }, + { + "epoch": 0.8660260725870617, + "grad_norm": 0.3293299674987793, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0057, + "step": 14150 + }, + { + "epoch": 0.8666381051471939, + "grad_norm": 0.35404127836227417, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0072, + "step": 14160 + }, + { + "epoch": 0.8672501377073261, + "grad_norm": 0.24674376845359802, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0064, + "step": 14170 + }, + { + "epoch": 0.8678621702674583, + "grad_norm": 0.23506462574005127, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0047, + "step": 14180 + }, + { + "epoch": 0.8684742028275905, + "grad_norm": 0.30500903725624084, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.8690862353877227, + "grad_norm": 0.23000167310237885, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0051, + "step": 14200 + }, + { + "epoch": 0.8696982679478549, + "grad_norm": 0.17339368164539337, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 0.8703103005079871, + "grad_norm": 0.2505367696285248, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0059, + "step": 14220 + }, + { + "epoch": 0.8709223330681192, + "grad_norm": 0.22645734250545502, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0044, + "step": 14230 + }, + { + "epoch": 0.8715343656282514, + "grad_norm": 0.3509127199649811, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0043, + "step": 14240 + }, + { + "epoch": 0.8721463981883836, + "grad_norm": 0.2758972644805908, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0055, + "step": 14250 + }, + { + "epoch": 0.8727584307485158, + "grad_norm": 0.1943834275007248, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.006, + "step": 14260 + }, + { + "epoch": 0.873370463308648, + "grad_norm": 0.32881075143814087, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 0.8739824958687802, + "grad_norm": 0.35203438997268677, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8745945284289124, + "grad_norm": 0.13618917763233185, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0044, + "step": 14290 + }, + { + "epoch": 0.8752065609890446, + "grad_norm": 0.22939404845237732, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0044, + "step": 14300 + }, + { + "epoch": 0.8758185935491768, + "grad_norm": 0.2027491182088852, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0051, + "step": 14310 + }, + { + "epoch": 0.876430626109309, + "grad_norm": 0.21950028836727142, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0105, + "step": 14320 + }, + { + "epoch": 0.8770426586694412, + "grad_norm": 0.307913213968277, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0051, + "step": 14330 + }, + { + "epoch": 0.8776546912295734, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0058, + "step": 14340 + }, + { + "epoch": 0.8782667237897056, + "grad_norm": 0.3033636808395386, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0066, + "step": 14350 + }, + { + "epoch": 0.8788787563498378, + "grad_norm": 0.25514236092567444, + "learning_rate": 1.210961823379053e-05, + "loss": 0.005, + "step": 14360 + }, + { + "epoch": 0.87949078890997, + "grad_norm": 0.2574418783187866, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0069, + "step": 14370 + }, + { + "epoch": 0.8801028214701022, + "grad_norm": 0.17803016304969788, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.006, + "step": 14380 + }, + { + "epoch": 0.8807148540302344, + "grad_norm": 0.31375741958618164, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0077, + "step": 14390 + }, + { + "epoch": 0.8813268865903666, + "grad_norm": 0.18031778931617737, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0052, + "step": 14400 + }, + { + "epoch": 0.8819389191504988, + "grad_norm": 0.18077519536018372, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0055, + "step": 14410 + }, + { + "epoch": 0.882550951710631, + "grad_norm": 0.22171644866466522, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0059, + "step": 14420 + }, + { + "epoch": 0.8831629842707632, + "grad_norm": 0.16187389194965363, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0044, + "step": 14430 + }, + { + "epoch": 0.8837750168308954, + "grad_norm": 0.27667325735092163, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0063, + "step": 14440 + }, + { + "epoch": 0.8843870493910276, + "grad_norm": 0.2493051290512085, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0072, + "step": 14450 + }, + { + "epoch": 0.8849990819511598, + "grad_norm": 0.3519611656665802, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 0.885611114511292, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0057, + "step": 14470 + }, + { + "epoch": 0.8862231470714242, + "grad_norm": 0.24518658220767975, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.8868351796315564, + "grad_norm": 0.28493785858154297, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0055, + "step": 14490 + }, + { + "epoch": 0.8874472121916887, + "grad_norm": 0.22260263562202454, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 0.8880592447518207, + "grad_norm": 0.2804561257362366, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0051, + "step": 14510 + }, + { + "epoch": 0.888671277311953, + "grad_norm": 0.24349385499954224, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0045, + "step": 14520 + }, + { + "epoch": 0.8892833098720851, + "grad_norm": 0.262207955121994, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0082, + "step": 14530 + }, + { + "epoch": 0.8898953424322174, + "grad_norm": 0.15527820587158203, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0061, + "step": 14540 + }, + { + "epoch": 0.8905073749923496, + "grad_norm": 0.23850804567337036, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0042, + "step": 14550 + }, + { + "epoch": 0.8911194075524818, + "grad_norm": 0.2665582001209259, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0053, + "step": 14560 + }, + { + "epoch": 0.891731440112614, + "grad_norm": 0.2652167081832886, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 0.8923434726727462, + "grad_norm": 0.21386243402957916, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "epoch": 0.8929555052328784, + "grad_norm": 0.3087247312068939, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0082, + "step": 14590 + }, + { + "epoch": 0.8935675377930106, + "grad_norm": 0.2003909796476364, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0048, + "step": 14600 + }, + { + "epoch": 0.8941795703531428, + "grad_norm": 0.2214624583721161, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0062, + "step": 14610 + }, + { + "epoch": 0.894791602913275, + "grad_norm": 0.2500647306442261, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0052, + "step": 14620 + }, + { + "epoch": 0.8954036354734072, + "grad_norm": 0.2615419030189514, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0054, + "step": 14630 + }, + { + "epoch": 0.8960156680335394, + "grad_norm": 0.21347551047801971, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0048, + "step": 14640 + }, + { + "epoch": 0.8966277005936716, + "grad_norm": 0.35483887791633606, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0054, + "step": 14650 + }, + { + "epoch": 0.8972397331538038, + "grad_norm": 0.2423439472913742, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0055, + "step": 14660 + }, + { + "epoch": 0.897851765713936, + "grad_norm": 0.16826359927654266, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 0.8984637982740682, + "grad_norm": 0.3589499294757843, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "epoch": 0.8990758308342004, + "grad_norm": 0.3081042468547821, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 0.8996878633943326, + "grad_norm": 0.31996914744377136, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0063, + "step": 14700 + }, + { + "epoch": 0.9002998959544648, + "grad_norm": 0.301209419965744, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0073, + "step": 14710 + }, + { + "epoch": 0.900911928514597, + "grad_norm": 0.19257168471813202, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0055, + "step": 14720 + }, + { + "epoch": 0.9015239610747292, + "grad_norm": 0.15221600234508514, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0053, + "step": 14730 + }, + { + "epoch": 0.9021359936348614, + "grad_norm": 0.21519577503204346, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0055, + "step": 14740 + }, + { + "epoch": 0.9027480261949936, + "grad_norm": 0.23772196471691132, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.006, + "step": 14750 + }, + { + "epoch": 0.9033600587551258, + "grad_norm": 0.2872219979763031, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.903972091315258, + "grad_norm": 0.2589483857154846, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0056, + "step": 14770 + }, + { + "epoch": 0.9045841238753902, + "grad_norm": 0.31850162148475647, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0051, + "step": 14780 + }, + { + "epoch": 0.9051961564355223, + "grad_norm": 0.27179282903671265, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0051, + "step": 14790 + }, + { + "epoch": 0.9058081889956545, + "grad_norm": 0.4132739007472992, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.005, + "step": 14800 + }, + { + "epoch": 0.9064202215557867, + "grad_norm": 0.19336774945259094, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0041, + "step": 14810 + }, + { + "epoch": 0.9070322541159189, + "grad_norm": 0.20783282816410065, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.9076442866760511, + "grad_norm": 0.26141899824142456, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0069, + "step": 14830 + }, + { + "epoch": 0.9082563192361833, + "grad_norm": 0.2158539742231369, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0081, + "step": 14840 + }, + { + "epoch": 0.9088683517963155, + "grad_norm": 0.3233732581138611, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0065, + "step": 14850 + }, + { + "epoch": 0.9094803843564477, + "grad_norm": 0.23924769461154938, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0042, + "step": 14860 + }, + { + "epoch": 0.9100924169165799, + "grad_norm": 0.17663812637329102, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.004, + "step": 14870 + }, + { + "epoch": 0.9107044494767121, + "grad_norm": 0.34379643201828003, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.005, + "step": 14880 + }, + { + "epoch": 0.9113164820368443, + "grad_norm": 0.29971349239349365, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0059, + "step": 14890 + }, + { + "epoch": 0.9119285145969765, + "grad_norm": 0.24832949042320251, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0042, + "step": 14900 + }, + { + "epoch": 0.9125405471571088, + "grad_norm": 0.22288024425506592, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0065, + "step": 14910 + }, + { + "epoch": 0.913152579717241, + "grad_norm": 0.2806689441204071, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0043, + "step": 14920 + }, + { + "epoch": 0.9137646122773732, + "grad_norm": 0.3908274173736572, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0047, + "step": 14930 + }, + { + "epoch": 0.9143766448375054, + "grad_norm": 0.16255778074264526, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0068, + "step": 14940 + }, + { + "epoch": 0.9149886773976376, + "grad_norm": 0.430791437625885, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0085, + "step": 14950 + }, + { + "epoch": 0.9156007099577698, + "grad_norm": 0.1739969551563263, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0056, + "step": 14960 + }, + { + "epoch": 0.916212742517902, + "grad_norm": 0.24298283457756042, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0055, + "step": 14970 + }, + { + "epoch": 0.9168247750780342, + "grad_norm": 0.21269915997982025, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0051, + "step": 14980 + }, + { + "epoch": 0.9174368076381664, + "grad_norm": 0.263388991355896, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0058, + "step": 14990 + }, + { + "epoch": 0.9180488401982986, + "grad_norm": 0.28030532598495483, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 0.9186608727584308, + "grad_norm": 0.17051894962787628, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 0.919272905318563, + "grad_norm": 0.2763383388519287, + "learning_rate": 1.146875176249365e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 0.9198849378786952, + "grad_norm": 0.2616822421550751, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "epoch": 0.9204969704388274, + "grad_norm": 0.21407093107700348, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0062, + "step": 15040 + }, + { + "epoch": 0.9211090029989596, + "grad_norm": 0.23936578631401062, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0073, + "step": 15050 + }, + { + "epoch": 0.9217210355590918, + "grad_norm": 0.26383110880851746, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.006, + "step": 15060 + }, + { + "epoch": 0.922333068119224, + "grad_norm": 0.19477945566177368, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0043, + "step": 15070 + }, + { + "epoch": 0.9229451006793561, + "grad_norm": 0.16677282750606537, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0061, + "step": 15080 + }, + { + "epoch": 0.9235571332394883, + "grad_norm": 0.26856037974357605, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0065, + "step": 15090 + }, + { + "epoch": 0.9241691657996205, + "grad_norm": 0.20086173713207245, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0056, + "step": 15100 + }, + { + "epoch": 0.9247811983597527, + "grad_norm": 0.26998719573020935, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0034, + "step": 15110 + }, + { + "epoch": 0.9253932309198849, + "grad_norm": 0.12727728486061096, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0043, + "step": 15120 + }, + { + "epoch": 0.9260052634800171, + "grad_norm": 0.11288347095251083, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0055, + "step": 15130 + }, + { + "epoch": 0.9266172960401493, + "grad_norm": 0.1109771579504013, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0048, + "step": 15140 + }, + { + "epoch": 0.9272293286002815, + "grad_norm": 0.2556479275226593, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0056, + "step": 15150 + }, + { + "epoch": 0.9278413611604137, + "grad_norm": 0.2149561196565628, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.005, + "step": 15160 + }, + { + "epoch": 0.9284533937205459, + "grad_norm": 0.16953054070472717, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 0.9290654262806781, + "grad_norm": 0.18306049704551697, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.004, + "step": 15180 + }, + { + "epoch": 0.9296774588408103, + "grad_norm": 0.15755385160446167, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0035, + "step": 15190 + }, + { + "epoch": 0.9302894914009425, + "grad_norm": 0.21062517166137695, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 0.9309015239610747, + "grad_norm": 0.1403888463973999, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0051, + "step": 15210 + }, + { + "epoch": 0.9315135565212069, + "grad_norm": 0.4044550359249115, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0062, + "step": 15220 + }, + { + "epoch": 0.9321255890813391, + "grad_norm": 0.22543896734714508, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 0.9327376216414713, + "grad_norm": 0.2025403380393982, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0055, + "step": 15240 + }, + { + "epoch": 0.9333496542016035, + "grad_norm": 1.0549683570861816, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0092, + "step": 15250 + }, + { + "epoch": 0.9339616867617357, + "grad_norm": 0.3442397117614746, + "learning_rate": 1.123494277220359e-05, + "loss": 0.005, + "step": 15260 + }, + { + "epoch": 0.934573719321868, + "grad_norm": 0.1678813248872757, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.005, + "step": 15270 + }, + { + "epoch": 0.9351857518820001, + "grad_norm": 0.31081119179725647, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 0.9357977844421324, + "grad_norm": 0.25498780608177185, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.006, + "step": 15290 + }, + { + "epoch": 0.9364098170022646, + "grad_norm": 0.21825125813484192, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0054, + "step": 15300 + }, + { + "epoch": 0.9370218495623968, + "grad_norm": 0.19719983637332916, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0074, + "step": 15310 + }, + { + "epoch": 0.937633882122529, + "grad_norm": 0.32297465205192566, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0058, + "step": 15320 + }, + { + "epoch": 0.9382459146826612, + "grad_norm": 0.2717733383178711, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0035, + "step": 15330 + }, + { + "epoch": 0.9388579472427934, + "grad_norm": 0.22138433158397675, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0048, + "step": 15340 + }, + { + "epoch": 0.9394699798029256, + "grad_norm": 0.1943465769290924, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 0.9400820123630577, + "grad_norm": 0.18422184884548187, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0054, + "step": 15360 + }, + { + "epoch": 0.9406940449231899, + "grad_norm": 0.17614246904850006, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0054, + "step": 15370 + }, + { + "epoch": 0.9413060774833221, + "grad_norm": 0.17661592364311218, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9419181100434543, + "grad_norm": 0.42976850271224976, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0054, + "step": 15390 + }, + { + "epoch": 0.9425301426035865, + "grad_norm": 0.34272316098213196, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0044, + "step": 15400 + }, + { + "epoch": 0.9431421751637187, + "grad_norm": 0.3346613645553589, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0042, + "step": 15410 + }, + { + "epoch": 0.9437542077238509, + "grad_norm": 0.15300114452838898, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0057, + "step": 15420 + }, + { + "epoch": 0.9443662402839831, + "grad_norm": 0.23935656249523163, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0084, + "step": 15430 + }, + { + "epoch": 0.9449782728441153, + "grad_norm": 0.21595227718353271, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0051, + "step": 15440 + }, + { + "epoch": 0.9455903054042475, + "grad_norm": 0.2670149505138397, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0059, + "step": 15450 + }, + { + "epoch": 0.9462023379643797, + "grad_norm": 0.2214009314775467, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0054, + "step": 15460 + }, + { + "epoch": 0.9468143705245119, + "grad_norm": 0.3491996228694916, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 0.9474264030846441, + "grad_norm": 0.28213024139404297, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0054, + "step": 15480 + }, + { + "epoch": 0.9480384356447763, + "grad_norm": 0.30218765139579773, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0049, + "step": 15490 + }, + { + "epoch": 0.9486504682049085, + "grad_norm": 0.17068025469779968, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0046, + "step": 15500 + }, + { + "epoch": 0.9492625007650407, + "grad_norm": 0.23325121402740479, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0054, + "step": 15510 + }, + { + "epoch": 0.9498745333251729, + "grad_norm": 0.22118528187274933, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0035, + "step": 15520 + }, + { + "epoch": 0.9504865658853051, + "grad_norm": 0.20202121138572693, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9510985984454373, + "grad_norm": 0.28455010056495667, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0039, + "step": 15540 + }, + { + "epoch": 0.9517106310055695, + "grad_norm": 0.26871445775032043, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0046, + "step": 15550 + }, + { + "epoch": 0.9523226635657017, + "grad_norm": 0.33665943145751953, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0058, + "step": 15560 + }, + { + "epoch": 0.9529346961258339, + "grad_norm": 0.3182595670223236, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0053, + "step": 15570 + }, + { + "epoch": 0.9535467286859661, + "grad_norm": 0.2867930829524994, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0068, + "step": 15580 + }, + { + "epoch": 0.9541587612460983, + "grad_norm": 0.21562239527702332, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.0051, + "step": 15590 + }, + { + "epoch": 0.9547707938062305, + "grad_norm": 0.19122859835624695, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0046, + "step": 15600 + }, + { + "epoch": 0.9553828263663627, + "grad_norm": 0.24596959352493286, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.005, + "step": 15610 + }, + { + "epoch": 0.9559948589264949, + "grad_norm": 0.182195246219635, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0038, + "step": 15620 + }, + { + "epoch": 0.9566068914866271, + "grad_norm": 0.3122585415840149, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0053, + "step": 15630 + }, + { + "epoch": 0.9572189240467592, + "grad_norm": 0.25725093483924866, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0063, + "step": 15640 + }, + { + "epoch": 0.9578309566068914, + "grad_norm": 0.19965514540672302, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0053, + "step": 15650 + }, + { + "epoch": 0.9584429891670236, + "grad_norm": 0.3474758267402649, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.006, + "step": 15660 + }, + { + "epoch": 0.9590550217271558, + "grad_norm": 0.18151336908340454, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0048, + "step": 15670 + }, + { + "epoch": 0.959667054287288, + "grad_norm": 0.18923020362854004, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0037, + "step": 15680 + }, + { + "epoch": 0.9602790868474202, + "grad_norm": 0.19792871177196503, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0049, + "step": 15690 + }, + { + "epoch": 0.9608911194075525, + "grad_norm": 0.20296797156333923, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0047, + "step": 15700 + }, + { + "epoch": 0.9615031519676847, + "grad_norm": 0.2556051015853882, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.9621151845278169, + "grad_norm": 0.35538288950920105, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0037, + "step": 15720 + }, + { + "epoch": 0.9627272170879491, + "grad_norm": 0.45357266068458557, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0065, + "step": 15730 + }, + { + "epoch": 0.9633392496480813, + "grad_norm": 0.23721693456172943, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0046, + "step": 15740 + }, + { + "epoch": 0.9639512822082135, + "grad_norm": 0.2727845013141632, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0052, + "step": 15750 + }, + { + "epoch": 0.9645633147683457, + "grad_norm": 0.2647950351238251, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0054, + "step": 15760 + }, + { + "epoch": 0.9651753473284779, + "grad_norm": 0.23364882171154022, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.005, + "step": 15770 + }, + { + "epoch": 0.9657873798886101, + "grad_norm": 0.2035825401544571, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0054, + "step": 15780 + }, + { + "epoch": 0.9663994124487423, + "grad_norm": 0.2411692589521408, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0062, + "step": 15790 + }, + { + "epoch": 0.9670114450088745, + "grad_norm": 0.23559266328811646, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 0.9676234775690067, + "grad_norm": 0.23872418701648712, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0063, + "step": 15810 + }, + { + "epoch": 0.9682355101291389, + "grad_norm": 0.27072128653526306, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0052, + "step": 15820 + }, + { + "epoch": 0.9688475426892711, + "grad_norm": 0.42610588669776917, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0056, + "step": 15830 + }, + { + "epoch": 0.9694595752494033, + "grad_norm": 0.13065233826637268, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0044, + "step": 15840 + }, + { + "epoch": 0.9700716078095355, + "grad_norm": 0.2479996383190155, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0049, + "step": 15850 + }, + { + "epoch": 0.9706836403696677, + "grad_norm": 0.22867974638938904, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 0.9712956729297999, + "grad_norm": 0.21570387482643127, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0048, + "step": 15870 + }, + { + "epoch": 0.9719077054899321, + "grad_norm": 0.26354169845581055, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0073, + "step": 15880 + }, + { + "epoch": 0.9725197380500643, + "grad_norm": 0.19785451889038086, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0044, + "step": 15890 + }, + { + "epoch": 0.9731317706101965, + "grad_norm": 0.09346124529838562, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0051, + "step": 15900 + }, + { + "epoch": 0.9737438031703287, + "grad_norm": 0.18946298956871033, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0049, + "step": 15910 + }, + { + "epoch": 0.9743558357304608, + "grad_norm": 0.1761726588010788, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0057, + "step": 15920 + }, + { + "epoch": 0.974967868290593, + "grad_norm": 0.2610328495502472, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0061, + "step": 15930 + }, + { + "epoch": 0.9755799008507252, + "grad_norm": 0.1841743141412735, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 0.9761919334108574, + "grad_norm": 0.14279355108737946, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 0.9768039659709896, + "grad_norm": 0.1717681884765625, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0035, + "step": 15960 + }, + { + "epoch": 0.9774159985311218, + "grad_norm": 0.2102527618408203, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.007, + "step": 15970 + }, + { + "epoch": 0.978028031091254, + "grad_norm": 0.29462379217147827, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0058, + "step": 15980 + }, + { + "epoch": 0.9786400636513862, + "grad_norm": 0.1863207072019577, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0058, + "step": 15990 + }, + { + "epoch": 0.9792520962115184, + "grad_norm": 0.2764773964881897, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0051, + "step": 16000 + }, + { + "epoch": 0.9798641287716506, + "grad_norm": 0.2723250091075897, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0056, + "step": 16010 + }, + { + "epoch": 0.9804761613317828, + "grad_norm": 0.21564331650733948, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0048, + "step": 16020 + }, + { + "epoch": 0.981088193891915, + "grad_norm": 0.20242232084274292, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0058, + "step": 16030 + }, + { + "epoch": 0.9817002264520472, + "grad_norm": 0.21522754430770874, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0039, + "step": 16040 + }, + { + "epoch": 0.9823122590121794, + "grad_norm": 0.20013833045959473, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0051, + "step": 16050 + }, + { + "epoch": 0.9829242915723116, + "grad_norm": 0.3008810579776764, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 0.9835363241324439, + "grad_norm": 0.2994979918003082, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0033, + "step": 16070 + }, + { + "epoch": 0.984148356692576, + "grad_norm": 0.22704628109931946, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0046, + "step": 16080 + }, + { + "epoch": 0.9847603892527083, + "grad_norm": 0.3253551423549652, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9853724218128405, + "grad_norm": 0.14902091026306152, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0042, + "step": 16100 + }, + { + "epoch": 0.9859844543729727, + "grad_norm": 0.15155524015426636, + "learning_rate": 1.04066696184376e-05, + "loss": 0.005, + "step": 16110 + }, + { + "epoch": 0.9865964869331049, + "grad_norm": 0.1859518140554428, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0063, + "step": 16120 + }, + { + "epoch": 0.9872085194932371, + "grad_norm": 0.5434902906417847, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0072, + "step": 16130 + }, + { + "epoch": 0.9878205520533693, + "grad_norm": 0.19308103621006012, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0046, + "step": 16140 + }, + { + "epoch": 0.9884325846135015, + "grad_norm": 0.21260593831539154, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0077, + "step": 16150 + }, + { + "epoch": 0.9890446171736337, + "grad_norm": 0.15255668759346008, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0059, + "step": 16160 + }, + { + "epoch": 0.9896566497337659, + "grad_norm": 0.18739885091781616, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0047, + "step": 16170 + }, + { + "epoch": 0.9902686822938981, + "grad_norm": 0.2112029641866684, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0049, + "step": 16180 + }, + { + "epoch": 0.9908807148540303, + "grad_norm": 0.35941991209983826, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.005, + "step": 16190 + }, + { + "epoch": 0.9914927474141624, + "grad_norm": 0.16792108118534088, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0051, + "step": 16200 + }, + { + "epoch": 0.9921047799742946, + "grad_norm": 0.1985466182231903, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0063, + "step": 16210 + }, + { + "epoch": 0.9927168125344268, + "grad_norm": 0.17579570412635803, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0046, + "step": 16220 + }, + { + "epoch": 0.993328845094559, + "grad_norm": 0.23352178931236267, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0061, + "step": 16230 + }, + { + "epoch": 0.9939408776546912, + "grad_norm": 0.3543553054332733, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0054, + "step": 16240 + }, + { + "epoch": 0.9945529102148234, + "grad_norm": 0.18603719770908356, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0049, + "step": 16250 + }, + { + "epoch": 0.9951649427749556, + "grad_norm": 0.31745344400405884, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0061, + "step": 16260 + }, + { + "epoch": 0.9957769753350878, + "grad_norm": 0.1416773498058319, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0063, + "step": 16270 + }, + { + "epoch": 0.99638900789522, + "grad_norm": 0.18451642990112305, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0055, + "step": 16280 + }, + { + "epoch": 0.9970010404553522, + "grad_norm": 0.13422183692455292, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0047, + "step": 16290 + }, + { + "epoch": 0.9976130730154844, + "grad_norm": 0.15831588208675385, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0045, + "step": 16300 + }, + { + "epoch": 0.9982251055756166, + "grad_norm": 0.42520084977149963, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0053, + "step": 16310 + }, + { + "epoch": 0.9988371381357488, + "grad_norm": 0.20889437198638916, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0043, + "step": 16320 + }, + { + "epoch": 0.999449170695881, + "grad_norm": 0.17016667127609253, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0072, + "step": 16330 + }, + { + "epoch": 1.0000612032560132, + "grad_norm": 0.3129214346408844, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0054, + "step": 16340 + }, + { + "epoch": 1.0006732358161454, + "grad_norm": 0.334224134683609, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0037, + "step": 16350 + }, + { + "epoch": 1.0012852683762776, + "grad_norm": 0.28502705693244934, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0065, + "step": 16360 + }, + { + "epoch": 1.0018973009364098, + "grad_norm": 0.21431966125965118, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0046, + "step": 16370 + }, + { + "epoch": 1.002509333496542, + "grad_norm": 0.22898051142692566, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 1.0031213660566742, + "grad_norm": 0.41625624895095825, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0068, + "step": 16390 + }, + { + "epoch": 1.0037333986168064, + "grad_norm": 0.2510327398777008, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 1.0043454311769386, + "grad_norm": 0.23560962080955505, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0049, + "step": 16410 + }, + { + "epoch": 1.0049574637370708, + "grad_norm": 0.2081199437379837, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0061, + "step": 16420 + }, + { + "epoch": 1.005569496297203, + "grad_norm": 0.12456244230270386, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0057, + "step": 16430 + }, + { + "epoch": 1.0061815288573353, + "grad_norm": 0.22212636470794678, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0052, + "step": 16440 + }, + { + "epoch": 1.0067935614174675, + "grad_norm": 0.27772897481918335, + "learning_rate": 1.007637577910799e-05, + "loss": 0.007, + "step": 16450 + }, + { + "epoch": 1.0074055939775997, + "grad_norm": 0.40040507912635803, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0051, + "step": 16460 + }, + { + "epoch": 1.0080176265377319, + "grad_norm": 0.19763565063476562, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0047, + "step": 16470 + }, + { + "epoch": 1.008629659097864, + "grad_norm": 0.2906181514263153, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0055, + "step": 16480 + }, + { + "epoch": 1.0092416916579963, + "grad_norm": 0.29949888586997986, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0045, + "step": 16490 + }, + { + "epoch": 1.0098537242181285, + "grad_norm": 0.3900962769985199, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0053, + "step": 16500 + }, + { + "epoch": 1.0104657567782607, + "grad_norm": 0.22380846738815308, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0043, + "step": 16510 + }, + { + "epoch": 1.0110777893383929, + "grad_norm": 0.3426673412322998, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0052, + "step": 16520 + }, + { + "epoch": 1.011689821898525, + "grad_norm": 0.2452230006456375, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0055, + "step": 16530 + }, + { + "epoch": 1.0123018544586573, + "grad_norm": 0.24280408024787903, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0042, + "step": 16540 + }, + { + "epoch": 1.0129138870187895, + "grad_norm": 0.18271701037883759, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0047, + "step": 16550 + }, + { + "epoch": 1.0135259195789217, + "grad_norm": 0.2874322235584259, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0059, + "step": 16560 + }, + { + "epoch": 1.0141379521390539, + "grad_norm": 0.17367394268512726, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0048, + "step": 16570 + }, + { + "epoch": 1.014749984699186, + "grad_norm": 0.167460098862648, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0074, + "step": 16580 + }, + { + "epoch": 1.0153620172593183, + "grad_norm": 0.21867765486240387, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 1.0159740498194505, + "grad_norm": 0.2539086639881134, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0057, + "step": 16600 + }, + { + "epoch": 1.0165860823795827, + "grad_norm": 0.1415795534849167, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 1.0171981149397147, + "grad_norm": 0.12702493369579315, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0038, + "step": 16620 + }, + { + "epoch": 1.0178101474998469, + "grad_norm": 0.16548305749893188, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0042, + "step": 16630 + }, + { + "epoch": 1.018422180059979, + "grad_norm": 0.4413173496723175, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0059, + "step": 16640 + }, + { + "epoch": 1.0190342126201113, + "grad_norm": 0.30871614813804626, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0045, + "step": 16650 + }, + { + "epoch": 1.0196462451802435, + "grad_norm": 0.259650319814682, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0044, + "step": 16660 + }, + { + "epoch": 1.0202582777403757, + "grad_norm": 0.36035388708114624, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0068, + "step": 16670 + }, + { + "epoch": 1.020870310300508, + "grad_norm": 0.3487808406352997, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0038, + "step": 16680 + }, + { + "epoch": 1.02148234286064, + "grad_norm": 0.2898370623588562, + "learning_rate": 9.843955128197274e-06, + "loss": 0.004, + "step": 16690 + }, + { + "epoch": 1.0220943754207723, + "grad_norm": 0.2942182719707489, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0042, + "step": 16700 + }, + { + "epoch": 1.0227064079809045, + "grad_norm": 0.27839869260787964, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0042, + "step": 16710 + }, + { + "epoch": 1.0233184405410367, + "grad_norm": 0.17199957370758057, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0059, + "step": 16720 + }, + { + "epoch": 1.023930473101169, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0073, + "step": 16730 + }, + { + "epoch": 1.0245425056613011, + "grad_norm": 0.19908513128757477, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0047, + "step": 16740 + }, + { + "epoch": 1.0251545382214333, + "grad_norm": 0.23300328850746155, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0055, + "step": 16750 + }, + { + "epoch": 1.0257665707815655, + "grad_norm": 0.24671277403831482, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0043, + "step": 16760 + }, + { + "epoch": 1.0263786033416977, + "grad_norm": 0.23183101415634155, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0052, + "step": 16770 + }, + { + "epoch": 1.02699063590183, + "grad_norm": 0.13460612297058105, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0035, + "step": 16780 + }, + { + "epoch": 1.0276026684619621, + "grad_norm": 0.1990940123796463, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0044, + "step": 16790 + }, + { + "epoch": 1.0282147010220943, + "grad_norm": 0.21223406493663788, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 1.0288267335822265, + "grad_norm": 0.2649106979370117, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0044, + "step": 16810 + }, + { + "epoch": 1.0294387661423587, + "grad_norm": 0.2524845600128174, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0048, + "step": 16820 + }, + { + "epoch": 1.030050798702491, + "grad_norm": 0.22169779241085052, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 1.0306628312626231, + "grad_norm": 0.16642418503761292, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0048, + "step": 16840 + }, + { + "epoch": 1.0312748638227553, + "grad_norm": 0.22939598560333252, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0048, + "step": 16850 + }, + { + "epoch": 1.0318868963828876, + "grad_norm": 0.2131129503250122, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0055, + "step": 16860 + }, + { + "epoch": 1.0324989289430198, + "grad_norm": 0.20492705702781677, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0041, + "step": 16870 + }, + { + "epoch": 1.033110961503152, + "grad_norm": 0.2988845705986023, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0042, + "step": 16880 + }, + { + "epoch": 1.0337229940632842, + "grad_norm": 0.18579600751399994, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0042, + "step": 16890 + }, + { + "epoch": 1.0343350266234164, + "grad_norm": 0.2553490698337555, + "learning_rate": 9.641222698101725e-06, + "loss": 0.005, + "step": 16900 + }, + { + "epoch": 1.0349470591835486, + "grad_norm": 0.338440865278244, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0036, + "step": 16910 + }, + { + "epoch": 1.0355590917436808, + "grad_norm": 0.12755723297595978, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0044, + "step": 16920 + }, + { + "epoch": 1.036171124303813, + "grad_norm": 0.12222232669591904, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 1.0367831568639452, + "grad_norm": 0.20246204733848572, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0055, + "step": 16940 + }, + { + "epoch": 1.0373951894240774, + "grad_norm": 0.36903291940689087, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0051, + "step": 16950 + }, + { + "epoch": 1.0380072219842096, + "grad_norm": 0.3166116178035736, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0045, + "step": 16960 + }, + { + "epoch": 1.0386192545443418, + "grad_norm": 0.2777375280857086, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0041, + "step": 16970 + }, + { + "epoch": 1.039231287104474, + "grad_norm": 0.3173989951610565, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0053, + "step": 16980 + }, + { + "epoch": 1.0398433196646062, + "grad_norm": 0.2135571539402008, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0051, + "step": 16990 + }, + { + "epoch": 1.0404553522247384, + "grad_norm": 0.18536782264709473, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0037, + "step": 17000 + }, + { + "epoch": 1.0410673847848706, + "grad_norm": 0.17782410979270935, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0052, + "step": 17010 + }, + { + "epoch": 1.0416794173450028, + "grad_norm": 0.31509512662887573, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0099, + "step": 17020 + }, + { + "epoch": 1.042291449905135, + "grad_norm": 0.22748225927352905, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 1.0429034824652672, + "grad_norm": 0.14924705028533936, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 1.0435155150253994, + "grad_norm": 0.21390999853610992, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0044, + "step": 17050 + }, + { + "epoch": 1.0441275475855316, + "grad_norm": 0.25828516483306885, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0042, + "step": 17060 + }, + { + "epoch": 1.0447395801456638, + "grad_norm": 0.24069662392139435, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0069, + "step": 17070 + }, + { + "epoch": 1.045351612705796, + "grad_norm": 0.1090504601597786, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0036, + "step": 17080 + }, + { + "epoch": 1.0459636452659282, + "grad_norm": 0.17990687489509583, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0049, + "step": 17090 + }, + { + "epoch": 1.0465756778260604, + "grad_norm": 0.21505555510520935, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0051, + "step": 17100 + }, + { + "epoch": 1.0471877103861926, + "grad_norm": 0.2157493680715561, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0063, + "step": 17110 + }, + { + "epoch": 1.0477997429463248, + "grad_norm": 0.30865493416786194, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0053, + "step": 17120 + }, + { + "epoch": 1.048411775506457, + "grad_norm": 0.16882938146591187, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0043, + "step": 17130 + }, + { + "epoch": 1.0490238080665892, + "grad_norm": 0.14921846985816956, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0496358406267214, + "grad_norm": 0.15723800659179688, + "learning_rate": 9.400800085133245e-06, + "loss": 0.005, + "step": 17150 + }, + { + "epoch": 1.0502478731868536, + "grad_norm": 0.19597285985946655, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0046, + "step": 17160 + }, + { + "epoch": 1.0508599057469858, + "grad_norm": 0.1684723198413849, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0053, + "step": 17170 + }, + { + "epoch": 1.051471938307118, + "grad_norm": 0.1733175367116928, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0053, + "step": 17180 + }, + { + "epoch": 1.0520839708672503, + "grad_norm": 0.23111647367477417, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0048, + "step": 17190 + }, + { + "epoch": 1.0526960034273822, + "grad_norm": 0.36174628138542175, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0049, + "step": 17200 + }, + { + "epoch": 1.0533080359875144, + "grad_norm": 0.15791575610637665, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0048, + "step": 17210 + }, + { + "epoch": 1.0539200685476466, + "grad_norm": 0.16026809811592102, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0047, + "step": 17220 + }, + { + "epoch": 1.0545321011077788, + "grad_norm": 0.13964296877384186, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0033, + "step": 17230 + }, + { + "epoch": 1.055144133667911, + "grad_norm": 0.22623896598815918, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0041, + "step": 17240 + }, + { + "epoch": 1.0557561662280432, + "grad_norm": 0.15534555912017822, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 1.0563681987881754, + "grad_norm": 0.09519665688276291, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0035, + "step": 17260 + }, + { + "epoch": 1.0569802313483077, + "grad_norm": 0.19323785603046417, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0045, + "step": 17270 + }, + { + "epoch": 1.0575922639084399, + "grad_norm": 0.21194952726364136, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0047, + "step": 17280 + }, + { + "epoch": 1.058204296468572, + "grad_norm": 0.28977999091148376, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0049, + "step": 17290 + }, + { + "epoch": 1.0588163290287043, + "grad_norm": 0.1739121824502945, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0062, + "step": 17300 + }, + { + "epoch": 1.0594283615888365, + "grad_norm": 0.23189865052700043, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0055, + "step": 17310 + }, + { + "epoch": 1.0600403941489687, + "grad_norm": 0.15705449879169464, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 1.0606524267091009, + "grad_norm": 0.23189882934093475, + "learning_rate": 9.228411903689187e-06, + "loss": 0.003, + "step": 17330 + }, + { + "epoch": 1.061264459269233, + "grad_norm": 0.19559095799922943, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0051, + "step": 17340 + }, + { + "epoch": 1.0618764918293653, + "grad_norm": 0.2560543715953827, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0624885243894975, + "grad_norm": 0.35167232155799866, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0042, + "step": 17360 + }, + { + "epoch": 1.0631005569496297, + "grad_norm": 0.17626497149467468, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0039, + "step": 17370 + }, + { + "epoch": 1.0637125895097619, + "grad_norm": 0.18818546831607819, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0043, + "step": 17380 + }, + { + "epoch": 1.064324622069894, + "grad_norm": 0.10237561911344528, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0053, + "step": 17390 + }, + { + "epoch": 1.0649366546300263, + "grad_norm": 0.21828459203243256, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0042, + "step": 17400 + }, + { + "epoch": 1.0655486871901585, + "grad_norm": 0.09354235231876373, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 1.0661607197502907, + "grad_norm": 0.18106088042259216, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0051, + "step": 17420 + }, + { + "epoch": 1.066772752310423, + "grad_norm": 0.21538101136684418, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0056, + "step": 17430 + }, + { + "epoch": 1.067384784870555, + "grad_norm": 0.18729519844055176, + "learning_rate": 9.1233909973763e-06, + "loss": 0.004, + "step": 17440 + }, + { + "epoch": 1.0679968174306873, + "grad_norm": 0.3791484832763672, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0052, + "step": 17450 + }, + { + "epoch": 1.0686088499908195, + "grad_norm": 0.19206254184246063, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0042, + "step": 17460 + }, + { + "epoch": 1.0692208825509517, + "grad_norm": 0.15434518456459045, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0061, + "step": 17470 + }, + { + "epoch": 1.069832915111084, + "grad_norm": 0.17898093163967133, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 1.0704449476712161, + "grad_norm": 0.21975649893283844, + "learning_rate": 9.07574141798717e-06, + "loss": 0.005, + "step": 17490 + }, + { + "epoch": 1.0710569802313483, + "grad_norm": 0.1380346417427063, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0032, + "step": 17500 + }, + { + "epoch": 1.0716690127914805, + "grad_norm": 0.28567400574684143, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0044, + "step": 17510 + }, + { + "epoch": 1.0722810453516127, + "grad_norm": 0.22925534844398499, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 1.072893077911745, + "grad_norm": 0.27094215154647827, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0047, + "step": 17530 + }, + { + "epoch": 1.0735051104718771, + "grad_norm": 0.32299691438674927, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0048, + "step": 17540 + }, + { + "epoch": 1.0741171430320093, + "grad_norm": 0.26789531111717224, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 1.0747291755921415, + "grad_norm": 0.3175952434539795, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0077, + "step": 17560 + }, + { + "epoch": 1.0753412081522737, + "grad_norm": 0.24784249067306519, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 1.075953240712406, + "grad_norm": 0.3081960380077362, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0046, + "step": 17580 + }, + { + "epoch": 1.0765652732725381, + "grad_norm": 0.25334152579307556, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0039, + "step": 17590 + }, + { + "epoch": 1.0771773058326704, + "grad_norm": 0.24747619032859802, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0777893383928026, + "grad_norm": 0.19048908352851868, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0049, + "step": 17610 + }, + { + "epoch": 1.0784013709529348, + "grad_norm": 0.18883349001407623, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "epoch": 1.079013403513067, + "grad_norm": 0.18653099238872528, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0044, + "step": 17630 + }, + { + "epoch": 1.0796254360731992, + "grad_norm": 0.1320251226425171, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 1.0802374686333314, + "grad_norm": 0.14996238052845, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0041, + "step": 17650 + }, + { + "epoch": 1.0808495011934636, + "grad_norm": 0.4576573073863983, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0059, + "step": 17660 + }, + { + "epoch": 1.0814615337535958, + "grad_norm": 0.19582511484622955, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0051, + "step": 17670 + }, + { + "epoch": 1.082073566313728, + "grad_norm": 0.21973003447055817, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0059, + "step": 17680 + }, + { + "epoch": 1.0826855988738602, + "grad_norm": 0.18183568120002747, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0056, + "step": 17690 + }, + { + "epoch": 1.0832976314339924, + "grad_norm": 0.1761978417634964, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0049, + "step": 17700 + }, + { + "epoch": 1.0839096639941246, + "grad_norm": 0.10185366123914719, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0041, + "step": 17710 + }, + { + "epoch": 1.0845216965542568, + "grad_norm": 0.262513130903244, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0046, + "step": 17720 + }, + { + "epoch": 1.0851337291143888, + "grad_norm": 0.36413198709487915, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0043, + "step": 17730 + }, + { + "epoch": 1.085745761674521, + "grad_norm": 0.2258218675851822, + "learning_rate": 8.83836825410936e-06, + "loss": 0.005, + "step": 17740 + }, + { + "epoch": 1.0863577942346532, + "grad_norm": 0.20840497314929962, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0067, + "step": 17750 + }, + { + "epoch": 1.0869698267947854, + "grad_norm": 0.33392995595932007, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.0875818593549176, + "grad_norm": 0.18477876484394073, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0039, + "step": 17770 + }, + { + "epoch": 1.0881938919150498, + "grad_norm": 0.14785899221897125, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0063, + "step": 17780 + }, + { + "epoch": 1.088805924475182, + "grad_norm": 0.12930043041706085, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0055, + "step": 17790 + }, + { + "epoch": 1.0894179570353142, + "grad_norm": 0.1541786789894104, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0035, + "step": 17800 + }, + { + "epoch": 1.0900299895954464, + "grad_norm": 0.1781499683856964, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0054, + "step": 17810 + }, + { + "epoch": 1.0906420221555786, + "grad_norm": 0.13659314811229706, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0912540547157108, + "grad_norm": 0.18936918675899506, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0041, + "step": 17830 + }, + { + "epoch": 1.091866087275843, + "grad_norm": 0.24795638024806976, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0043, + "step": 17840 + }, + { + "epoch": 1.0924781198359752, + "grad_norm": 0.28090324997901917, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 1.0930901523961074, + "grad_norm": 0.3130576014518738, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0042, + "step": 17860 + }, + { + "epoch": 1.0937021849562396, + "grad_norm": 0.19758646190166473, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0047, + "step": 17870 + }, + { + "epoch": 1.0943142175163718, + "grad_norm": 0.20309071242809296, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0042, + "step": 17880 + }, + { + "epoch": 1.094926250076504, + "grad_norm": 0.19741898775100708, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0057, + "step": 17890 + }, + { + "epoch": 1.0955382826366362, + "grad_norm": 0.19182747602462769, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 1.0961503151967684, + "grad_norm": 0.14508575201034546, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.0967623477569006, + "grad_norm": 0.19854849576950073, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0064, + "step": 17920 + }, + { + "epoch": 1.0973743803170328, + "grad_norm": 0.15055720508098602, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0049, + "step": 17930 + }, + { + "epoch": 1.097986412877165, + "grad_norm": 0.1855372190475464, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0043, + "step": 17940 + }, + { + "epoch": 1.0985984454372972, + "grad_norm": 0.13770940899848938, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0058, + "step": 17950 + }, + { + "epoch": 1.0992104779974294, + "grad_norm": 0.24905221164226532, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0048, + "step": 17960 + }, + { + "epoch": 1.0998225105575616, + "grad_norm": 0.1951165348291397, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0043, + "step": 17970 + }, + { + "epoch": 1.1004345431176938, + "grad_norm": 0.18365852534770966, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 1.101046575677826, + "grad_norm": 0.16304127871990204, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0034, + "step": 17990 + }, + { + "epoch": 1.1016586082379582, + "grad_norm": 0.262677401304245, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 1.1022706407980905, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.583791146965244e-06, + "loss": 0.007, + "step": 18010 + }, + { + "epoch": 1.1028826733582227, + "grad_norm": 0.2832951247692108, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0056, + "step": 18020 + }, + { + "epoch": 1.1034947059183549, + "grad_norm": 0.1781810224056244, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 1.104106738478487, + "grad_norm": 0.23228950798511505, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0045, + "step": 18040 + }, + { + "epoch": 1.1047187710386193, + "grad_norm": 0.2573170065879822, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 1.1053308035987515, + "grad_norm": 0.30996036529541016, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0054, + "step": 18060 + }, + { + "epoch": 1.1059428361588837, + "grad_norm": 0.24979132413864136, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0045, + "step": 18070 + }, + { + "epoch": 1.1065548687190159, + "grad_norm": 0.17564314603805542, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0033, + "step": 18080 + }, + { + "epoch": 1.107166901279148, + "grad_norm": 0.14539776742458344, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0047, + "step": 18090 + }, + { + "epoch": 1.1077789338392803, + "grad_norm": 0.2530387341976166, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0058, + "step": 18100 + }, + { + "epoch": 1.1083909663994125, + "grad_norm": 0.2038760781288147, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0052, + "step": 18110 + }, + { + "epoch": 1.1090029989595447, + "grad_norm": 0.1769075244665146, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0043, + "step": 18120 + }, + { + "epoch": 1.1096150315196769, + "grad_norm": 0.1686626374721527, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0055, + "step": 18130 + }, + { + "epoch": 1.110227064079809, + "grad_norm": 0.21752336621284485, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0052, + "step": 18140 + }, + { + "epoch": 1.1108390966399413, + "grad_norm": 0.2739295959472656, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0039, + "step": 18150 + }, + { + "epoch": 1.1114511292000735, + "grad_norm": 0.18259567022323608, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0038, + "step": 18160 + }, + { + "epoch": 1.1120631617602057, + "grad_norm": 0.21565310657024384, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0043, + "step": 18170 + }, + { + "epoch": 1.112675194320338, + "grad_norm": 0.2141607403755188, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0059, + "step": 18180 + }, + { + "epoch": 1.11328722688047, + "grad_norm": 0.3017563819885254, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0044, + "step": 18190 + }, + { + "epoch": 1.1138992594406023, + "grad_norm": 0.2021455019712448, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0044, + "step": 18200 + }, + { + "epoch": 1.1145112920007345, + "grad_norm": 0.2113070785999298, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0048, + "step": 18210 + }, + { + "epoch": 1.1151233245608667, + "grad_norm": 0.18945784866809845, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0029, + "step": 18220 + }, + { + "epoch": 1.115735357120999, + "grad_norm": 0.15259192883968353, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0043, + "step": 18230 + }, + { + "epoch": 1.1163473896811311, + "grad_norm": 0.17555822432041168, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0039, + "step": 18240 + }, + { + "epoch": 1.1169594222412633, + "grad_norm": 0.20105648040771484, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0058, + "step": 18250 + }, + { + "epoch": 1.1175714548013955, + "grad_norm": 0.31626567244529724, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0044, + "step": 18260 + }, + { + "epoch": 1.1181834873615277, + "grad_norm": 0.16219007968902588, + "learning_rate": 8.340593854157868e-06, + "loss": 0.005, + "step": 18270 + }, + { + "epoch": 1.11879551992166, + "grad_norm": 0.2174186110496521, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0055, + "step": 18280 + }, + { + "epoch": 1.1194075524817921, + "grad_norm": 0.13639339804649353, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0043, + "step": 18290 + }, + { + "epoch": 1.1200195850419243, + "grad_norm": 0.15100249648094177, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0042, + "step": 18300 + }, + { + "epoch": 1.1206316176020565, + "grad_norm": 0.2114904671907425, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0043, + "step": 18310 + }, + { + "epoch": 1.1212436501621887, + "grad_norm": 0.2941966950893402, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0052, + "step": 18320 + }, + { + "epoch": 1.1218556827223207, + "grad_norm": 0.21695150434970856, + "learning_rate": 8.28476400245882e-06, + "loss": 0.005, + "step": 18330 + }, + { + "epoch": 1.122467715282453, + "grad_norm": 0.11768218129873276, + "learning_rate": 8.275470116190976e-06, + "loss": 0.005, + "step": 18340 + }, + { + "epoch": 1.1230797478425851, + "grad_norm": 0.1427483856678009, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0039, + "step": 18350 + }, + { + "epoch": 1.1236917804027173, + "grad_norm": 0.1837971955537796, + "learning_rate": 8.256891946721157e-06, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 1.1243038129628495, + "grad_norm": 0.30968883633613586, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0037, + "step": 18370 + }, + { + "epoch": 1.1249158455229817, + "grad_norm": 0.13366396725177765, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0042, + "step": 18380 + }, + { + "epoch": 1.125527878083114, + "grad_norm": 0.1829235553741455, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0039, + "step": 18390 + }, + { + "epoch": 1.1261399106432461, + "grad_norm": 0.3106991648674011, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0052, + "step": 18400 + }, + { + "epoch": 1.1267519432033783, + "grad_norm": 0.38655754923820496, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0046, + "step": 18410 + }, + { + "epoch": 1.1273639757635106, + "grad_norm": 0.23598383367061615, + "learning_rate": 8.201235047388747e-06, + "loss": 0.004, + "step": 18420 + }, + { + "epoch": 1.1279760083236428, + "grad_norm": 0.17428012192249298, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0046, + "step": 18430 + }, + { + "epoch": 1.128588040883775, + "grad_norm": 0.1847466081380844, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0043, + "step": 18440 + }, + { + "epoch": 1.1292000734439072, + "grad_norm": 0.14917762577533722, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0049, + "step": 18450 + }, + { + "epoch": 1.1298121060040394, + "grad_norm": 0.2882528305053711, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 1.1304241385641716, + "grad_norm": 0.36186549067497253, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0065, + "step": 18470 + }, + { + "epoch": 1.1310361711243038, + "grad_norm": 0.1604463905096054, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0037, + "step": 18480 + }, + { + "epoch": 1.131648203684436, + "grad_norm": 0.17751921713352203, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0034, + "step": 18490 + }, + { + "epoch": 1.1322602362445682, + "grad_norm": 0.15355733036994934, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0059, + "step": 18500 + }, + { + "epoch": 1.1328722688047004, + "grad_norm": 0.21558596193790436, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0044, + "step": 18510 + }, + { + "epoch": 1.1334843013648326, + "grad_norm": 0.20114412903785706, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1340963339249648, + "grad_norm": 0.17260855436325073, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0053, + "step": 18530 + }, + { + "epoch": 1.134708366485097, + "grad_norm": 0.16089287400245667, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0032, + "step": 18540 + }, + { + "epoch": 1.1353203990452292, + "grad_norm": 0.14655937254428864, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 1.1359324316053614, + "grad_norm": 0.16373249888420105, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0058, + "step": 18560 + }, + { + "epoch": 1.1365444641654936, + "grad_norm": 0.14543801546096802, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1371564967256258, + "grad_norm": 0.3515278100967407, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0043, + "step": 18580 + }, + { + "epoch": 1.137768529285758, + "grad_norm": 0.21776945888996124, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0053, + "step": 18590 + }, + { + "epoch": 1.1383805618458902, + "grad_norm": 0.21879829466342926, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0051, + "step": 18600 + }, + { + "epoch": 1.1389925944060224, + "grad_norm": 0.16967973113059998, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0048, + "step": 18610 + }, + { + "epoch": 1.1396046269661546, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0056, + "step": 18620 + }, + { + "epoch": 1.1402166595262868, + "grad_norm": 0.1858961284160614, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0067, + "step": 18630 + }, + { + "epoch": 1.140828692086419, + "grad_norm": 0.25853803753852844, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0057, + "step": 18640 + }, + { + "epoch": 1.1414407246465512, + "grad_norm": 0.18566234409809113, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0048, + "step": 18650 + }, + { + "epoch": 1.1420527572066834, + "grad_norm": 0.3471083343029022, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0042, + "step": 18660 + }, + { + "epoch": 1.1426647897668156, + "grad_norm": 0.2092636376619339, + "learning_rate": 7.970630670012853e-06, + "loss": 0.004, + "step": 18670 + }, + { + "epoch": 1.1432768223269478, + "grad_norm": 0.3432580828666687, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 1.14388885488708, + "grad_norm": 0.14227882027626038, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0041, + "step": 18690 + }, + { + "epoch": 1.1445008874472122, + "grad_norm": 0.2128007709980011, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0037, + "step": 18700 + }, + { + "epoch": 1.1451129200073444, + "grad_norm": 0.25377482175827026, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0049, + "step": 18710 + }, + { + "epoch": 1.1457249525674766, + "grad_norm": 0.1905982494354248, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0037, + "step": 18720 + }, + { + "epoch": 1.1463369851276088, + "grad_norm": 0.3090096712112427, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0079, + "step": 18730 + }, + { + "epoch": 1.146949017687741, + "grad_norm": 0.15604345500469208, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0037, + "step": 18740 + }, + { + "epoch": 1.1475610502478732, + "grad_norm": 0.21756386756896973, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0043, + "step": 18750 + }, + { + "epoch": 1.1481730828080055, + "grad_norm": 0.23869304358959198, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0038, + "step": 18760 + }, + { + "epoch": 1.1487851153681377, + "grad_norm": 0.18082380294799805, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 1.1493971479282699, + "grad_norm": 0.4032754898071289, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0061, + "step": 18780 + }, + { + "epoch": 1.150009180488402, + "grad_norm": 0.3173290491104126, + "learning_rate": 7.860719408056385e-06, + "loss": 0.004, + "step": 18790 + }, + { + "epoch": 1.1506212130485343, + "grad_norm": 0.18892645835876465, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.1512332456086665, + "grad_norm": 0.26740241050720215, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0056, + "step": 18810 + }, + { + "epoch": 1.1518452781687987, + "grad_norm": 0.3046218752861023, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0072, + "step": 18820 + }, + { + "epoch": 1.1524573107289309, + "grad_norm": 0.17181983590126038, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0034, + "step": 18830 + }, + { + "epoch": 1.1530693432890629, + "grad_norm": 0.22095724940299988, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0045, + "step": 18840 + }, + { + "epoch": 1.153681375849195, + "grad_norm": 0.1514609307050705, + "learning_rate": 7.80596155940873e-06, + "loss": 0.004, + "step": 18850 + }, + { + "epoch": 1.1542934084093273, + "grad_norm": 0.15244366228580475, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0047, + "step": 18860 + }, + { + "epoch": 1.1549054409694595, + "grad_norm": 0.24359947443008423, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0039, + "step": 18870 + }, + { + "epoch": 1.1555174735295917, + "grad_norm": 0.15558156371116638, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0036, + "step": 18880 + }, + { + "epoch": 1.1561295060897239, + "grad_norm": 0.33679234981536865, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0041, + "step": 18890 + }, + { + "epoch": 1.156741538649856, + "grad_norm": 0.15811999142169952, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0062, + "step": 18900 + }, + { + "epoch": 1.1573535712099883, + "grad_norm": 0.14838527143001556, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0029, + "step": 18910 + }, + { + "epoch": 1.1579656037701205, + "grad_norm": 0.23024815320968628, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0038, + "step": 18920 + }, + { + "epoch": 1.1585776363302527, + "grad_norm": 0.18455618619918823, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0044, + "step": 18930 + }, + { + "epoch": 1.1591896688903849, + "grad_norm": 0.20213079452514648, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.159801701450517, + "grad_norm": 0.19000643491744995, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0043, + "step": 18950 + }, + { + "epoch": 1.1604137340106493, + "grad_norm": 0.14075686037540436, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0047, + "step": 18960 + }, + { + "epoch": 1.1610257665707815, + "grad_norm": 0.22101792693138123, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0045, + "step": 18970 + }, + { + "epoch": 1.1616377991309137, + "grad_norm": 0.1097906231880188, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0037, + "step": 18980 + }, + { + "epoch": 1.162249831691046, + "grad_norm": 0.16169370710849762, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.162861864251178, + "grad_norm": 0.32931753993034363, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0052, + "step": 19000 + }, + { + "epoch": 1.1634738968113103, + "grad_norm": 0.2494741678237915, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0057, + "step": 19010 + }, + { + "epoch": 1.1640859293714425, + "grad_norm": 0.18492171168327332, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0056, + "step": 19020 + }, + { + "epoch": 1.1646979619315747, + "grad_norm": 0.18830963969230652, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0036, + "step": 19030 + }, + { + "epoch": 1.165309994491707, + "grad_norm": 0.1331586092710495, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0046, + "step": 19040 + }, + { + "epoch": 1.1659220270518391, + "grad_norm": 0.2433806210756302, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0053, + "step": 19050 + }, + { + "epoch": 1.1665340596119713, + "grad_norm": 0.24491485953330994, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 1.1671460921721035, + "grad_norm": 0.1789211630821228, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 1.1677581247322357, + "grad_norm": 0.2729121148586273, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0043, + "step": 19080 + }, + { + "epoch": 1.168370157292368, + "grad_norm": 0.19535189867019653, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0056, + "step": 19090 + }, + { + "epoch": 1.1689821898525001, + "grad_norm": 0.2282983660697937, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0048, + "step": 19100 + }, + { + "epoch": 1.1695942224126323, + "grad_norm": 0.1281195729970932, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0045, + "step": 19110 + }, + { + "epoch": 1.1702062549727645, + "grad_norm": 0.2850968539714813, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0034, + "step": 19120 + }, + { + "epoch": 1.1708182875328967, + "grad_norm": 0.12891536951065063, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.171430320093029, + "grad_norm": 0.13464727997779846, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0033, + "step": 19140 + }, + { + "epoch": 1.1720423526531611, + "grad_norm": 0.2415568083524704, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0041, + "step": 19150 + }, + { + "epoch": 1.1726543852132933, + "grad_norm": 0.15686331689357758, + "learning_rate": 7.525246655150879e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 1.1732664177734256, + "grad_norm": 0.15490666031837463, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0039, + "step": 19170 + }, + { + "epoch": 1.1738784503335578, + "grad_norm": 0.14095450937747955, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0034, + "step": 19180 + }, + { + "epoch": 1.17449048289369, + "grad_norm": 0.19024531543254852, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0084, + "step": 19190 + }, + { + "epoch": 1.1751025154538222, + "grad_norm": 0.2583692669868469, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 1.1757145480139544, + "grad_norm": 0.19117654860019684, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0038, + "step": 19210 + }, + { + "epoch": 1.1763265805740866, + "grad_norm": 0.15838374197483063, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1769386131342188, + "grad_norm": 0.30352044105529785, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0052, + "step": 19230 + }, + { + "epoch": 1.177550645694351, + "grad_norm": 0.229969322681427, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0043, + "step": 19240 + }, + { + "epoch": 1.1781626782544832, + "grad_norm": 0.17781461775302887, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0046, + "step": 19250 + }, + { + "epoch": 1.1787747108146154, + "grad_norm": 0.1306339055299759, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1793867433747476, + "grad_norm": 0.15727253258228302, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0045, + "step": 19270 + }, + { + "epoch": 1.1799987759348798, + "grad_norm": 0.24909166991710663, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0045, + "step": 19280 + }, + { + "epoch": 1.180610808495012, + "grad_norm": 0.4604126811027527, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0053, + "step": 19290 + }, + { + "epoch": 1.1812228410551442, + "grad_norm": 0.12739762663841248, + "learning_rate": 7.399737764864619e-06, + "loss": 0.004, + "step": 19300 + }, + { + "epoch": 1.1818348736152764, + "grad_norm": 0.2849223017692566, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0043, + "step": 19310 + }, + { + "epoch": 1.1824469061754086, + "grad_norm": 0.26089897751808167, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0044, + "step": 19320 + }, + { + "epoch": 1.1830589387355408, + "grad_norm": 0.1752242147922516, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.183670971295673, + "grad_norm": 0.14917130768299103, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0097, + "step": 19340 + }, + { + "epoch": 1.1842830038558052, + "grad_norm": 0.1599114090204239, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0061, + "step": 19350 + }, + { + "epoch": 1.1848950364159374, + "grad_norm": 0.16370004415512085, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 1.1855070689760696, + "grad_norm": 0.19354844093322754, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0032, + "step": 19370 + }, + { + "epoch": 1.1861191015362018, + "grad_norm": 0.19689561426639557, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0067, + "step": 19380 + }, + { + "epoch": 1.186731134096334, + "grad_norm": 0.22203278541564941, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0041, + "step": 19390 + }, + { + "epoch": 1.1873431666564662, + "grad_norm": 0.13579773902893066, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0048, + "step": 19400 + }, + { + "epoch": 1.1879551992165984, + "grad_norm": 0.12321218848228455, + "learning_rate": 7.301703138094429e-06, + "loss": 0.004, + "step": 19410 + }, + { + "epoch": 1.1885672317767306, + "grad_norm": 0.28819525241851807, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 1.1891792643368628, + "grad_norm": 0.2577916085720062, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0039, + "step": 19430 + }, + { + "epoch": 1.189791296896995, + "grad_norm": 0.26840633153915405, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0062, + "step": 19440 + }, + { + "epoch": 1.1904033294571272, + "grad_norm": 0.24222144484519958, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1910153620172594, + "grad_norm": 0.157009556889534, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0038, + "step": 19460 + }, + { + "epoch": 1.1916273945773916, + "grad_norm": 0.19925500452518463, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0042, + "step": 19470 + }, + { + "epoch": 1.1922394271375236, + "grad_norm": 0.19200846552848816, + "learning_rate": 7.239590017751423e-06, + "loss": 0.004, + "step": 19480 + }, + { + "epoch": 1.1928514596976558, + "grad_norm": 0.18441490828990936, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 1.193463492257788, + "grad_norm": 0.27565324306488037, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0046, + "step": 19500 + }, + { + "epoch": 1.1940755248179202, + "grad_norm": 0.17830556631088257, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0043, + "step": 19510 + }, + { + "epoch": 1.1946875573780524, + "grad_norm": 0.2769330143928528, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 1.1952995899381846, + "grad_norm": 0.168451189994812, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0039, + "step": 19530 + }, + { + "epoch": 1.1959116224983168, + "grad_norm": 0.31246763467788696, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0046, + "step": 19540 + }, + { + "epoch": 1.196523655058449, + "grad_norm": 0.21112671494483948, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0041, + "step": 19550 + }, + { + "epoch": 1.1971356876185812, + "grad_norm": 0.31681302189826965, + "learning_rate": 7.168868583990693e-06, + "loss": 0.005, + "step": 19560 + }, + { + "epoch": 1.1977477201787134, + "grad_norm": 0.18634411692619324, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0042, + "step": 19570 + }, + { + "epoch": 1.1983597527388457, + "grad_norm": 0.17780153453350067, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0057, + "step": 19580 + }, + { + "epoch": 1.1989717852989779, + "grad_norm": 0.19183002412319183, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0043, + "step": 19590 + }, + { + "epoch": 1.19958381785911, + "grad_norm": 0.28469574451446533, + "learning_rate": 7.133615440411572e-06, + "loss": 0.004, + "step": 19600 + }, + { + "epoch": 1.2001958504192423, + "grad_norm": 0.22470368444919586, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0044, + "step": 19610 + }, + { + "epoch": 1.2008078829793745, + "grad_norm": 0.23563240468502045, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0041, + "step": 19620 + }, + { + "epoch": 1.2014199155395067, + "grad_norm": 0.18467430770397186, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0053, + "step": 19630 + }, + { + "epoch": 1.2020319480996389, + "grad_norm": 0.12539178133010864, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0047, + "step": 19640 + }, + { + "epoch": 1.202643980659771, + "grad_norm": 0.2552005648612976, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.004, + "step": 19650 + }, + { + "epoch": 1.2032560132199033, + "grad_norm": 0.13963459432125092, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0035, + "step": 19660 + }, + { + "epoch": 1.2038680457800355, + "grad_norm": 0.17387327551841736, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0038, + "step": 19670 + }, + { + "epoch": 1.2044800783401677, + "grad_norm": 0.1284111589193344, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0044, + "step": 19680 + }, + { + "epoch": 1.2050921109002999, + "grad_norm": 0.22337380051612854, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 1.205704143460432, + "grad_norm": 0.2254808247089386, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0033, + "step": 19700 + }, + { + "epoch": 1.2063161760205643, + "grad_norm": 0.19316980242729187, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0041, + "step": 19710 + }, + { + "epoch": 1.2069282085806965, + "grad_norm": 0.17951075732707977, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0038, + "step": 19720 + }, + { + "epoch": 1.2075402411408287, + "grad_norm": 0.3105165660381317, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0043, + "step": 19730 + }, + { + "epoch": 1.208152273700961, + "grad_norm": 0.21083533763885498, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0039, + "step": 19740 + }, + { + "epoch": 1.208764306261093, + "grad_norm": 0.20121195912361145, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0035, + "step": 19750 + }, + { + "epoch": 1.2093763388212253, + "grad_norm": 0.20067447423934937, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0047, + "step": 19760 + }, + { + "epoch": 1.2099883713813575, + "grad_norm": 0.15943066775798798, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0039, + "step": 19770 + }, + { + "epoch": 1.2106004039414897, + "grad_norm": 0.21581032872200012, + "learning_rate": 6.975884226362e-06, + "loss": 0.0045, + "step": 19780 + }, + { + "epoch": 1.211212436501622, + "grad_norm": 0.16258753836154938, + "learning_rate": 6.967165692827958e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 1.2118244690617541, + "grad_norm": 0.18742400407791138, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0047, + "step": 19800 + }, + { + "epoch": 1.2124365016218863, + "grad_norm": 0.09035168588161469, + "learning_rate": 6.949742834253074e-06, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 1.2130485341820185, + "grad_norm": 0.21749694645404816, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0054, + "step": 19820 + }, + { + "epoch": 1.2136605667421507, + "grad_norm": 0.3189448416233063, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0043, + "step": 19830 + }, + { + "epoch": 1.214272599302283, + "grad_norm": 0.26815512776374817, + "learning_rate": 6.923644220932124e-06, + "loss": 0.005, + "step": 19840 + }, + { + "epoch": 1.2148846318624151, + "grad_norm": 0.19533704221248627, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0037, + "step": 19850 + }, + { + "epoch": 1.2154966644225473, + "grad_norm": 0.36249589920043945, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0064, + "step": 19860 + }, + { + "epoch": 1.2161086969826795, + "grad_norm": 0.19801265001296997, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0042, + "step": 19870 + }, + { + "epoch": 1.2167207295428117, + "grad_norm": 0.10341386497020721, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0053, + "step": 19880 + }, + { + "epoch": 1.217332762102944, + "grad_norm": 0.17985381186008453, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.2179447946630761, + "grad_norm": 0.18160982429981232, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 1.2185568272232083, + "grad_norm": 0.15552182495594025, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 1.2191688597833406, + "grad_norm": 0.34908807277679443, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0046, + "step": 19920 + }, + { + "epoch": 1.2197808923434728, + "grad_norm": 0.14835652709007263, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0042, + "step": 19930 + }, + { + "epoch": 1.220392924903605, + "grad_norm": 0.23276430368423462, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 1.2210049574637372, + "grad_norm": 0.1900823563337326, + "learning_rate": 6.828319751504063e-06, + "loss": 0.004, + "step": 19950 + }, + { + "epoch": 1.2216169900238694, + "grad_norm": 0.134046271443367, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0039, + "step": 19960 + }, + { + "epoch": 1.2222290225840013, + "grad_norm": 0.17264600098133087, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 1.2228410551441335, + "grad_norm": 0.24845834076404572, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0044, + "step": 19980 + }, + { + "epoch": 1.2234530877042658, + "grad_norm": 0.14805762469768524, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0049, + "step": 19990 + }, + { + "epoch": 1.224065120264398, + "grad_norm": 0.228907972574234, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0043, + "step": 20000 + }, + { + "epoch": 1.2246771528245302, + "grad_norm": 0.16869507730007172, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0041, + "step": 20010 + }, + { + "epoch": 1.2252891853846624, + "grad_norm": 0.1983603835105896, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0041, + "step": 20020 + }, + { + "epoch": 1.2259012179447946, + "grad_norm": 0.17656362056732178, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0028, + "step": 20030 + }, + { + "epoch": 1.2265132505049268, + "grad_norm": 0.1360313892364502, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0069, + "step": 20040 + }, + { + "epoch": 1.227125283065059, + "grad_norm": 0.21057721972465515, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2277373156251912, + "grad_norm": 0.138632670044899, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0038, + "step": 20060 + }, + { + "epoch": 1.2283493481853234, + "grad_norm": 0.17815573513507843, + "learning_rate": 6.725005485342219e-06, + "loss": 0.003, + "step": 20070 + }, + { + "epoch": 1.2289613807454556, + "grad_norm": 0.1769353598356247, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0066, + "step": 20080 + }, + { + "epoch": 1.2295734133055878, + "grad_norm": 0.23068928718566895, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0048, + "step": 20090 + }, + { + "epoch": 1.23018544586572, + "grad_norm": 0.25139328837394714, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0049, + "step": 20100 + }, + { + "epoch": 1.2307974784258522, + "grad_norm": 0.09128634631633759, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0042, + "step": 20110 + }, + { + "epoch": 1.2314095109859844, + "grad_norm": 0.20516613125801086, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0031, + "step": 20120 + }, + { + "epoch": 1.2320215435461166, + "grad_norm": 0.1518358588218689, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0049, + "step": 20130 + }, + { + "epoch": 1.2326335761062488, + "grad_norm": 0.1673758625984192, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0044, + "step": 20140 + }, + { + "epoch": 1.233245608666381, + "grad_norm": 0.14084585011005402, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0053, + "step": 20150 + }, + { + "epoch": 1.2338576412265132, + "grad_norm": 0.23316942155361176, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0042, + "step": 20160 + }, + { + "epoch": 1.2344696737866454, + "grad_norm": 0.23793813586235046, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0045, + "step": 20170 + }, + { + "epoch": 1.2350817063467776, + "grad_norm": 0.4269389510154724, + "learning_rate": 6.630934952049143e-06, + "loss": 0.005, + "step": 20180 + }, + { + "epoch": 1.2356937389069098, + "grad_norm": 0.15654191374778748, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0039, + "step": 20190 + }, + { + "epoch": 1.236305771467042, + "grad_norm": 0.19204623997211456, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0032, + "step": 20200 + }, + { + "epoch": 1.2369178040271742, + "grad_norm": 0.15817691385746002, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0044, + "step": 20210 + }, + { + "epoch": 1.2375298365873064, + "grad_norm": 0.12637947499752045, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2381418691474386, + "grad_norm": 0.26657921075820923, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0041, + "step": 20230 + }, + { + "epoch": 1.2387539017075708, + "grad_norm": 0.15207791328430176, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0045, + "step": 20240 + }, + { + "epoch": 1.239365934267703, + "grad_norm": 0.32583367824554443, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 1.2399779668278352, + "grad_norm": 0.15617726743221283, + "learning_rate": 6.562908932779455e-06, + "loss": 0.004, + "step": 20260 + }, + { + "epoch": 1.2405899993879674, + "grad_norm": 0.1935809850692749, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2412020319480996, + "grad_norm": 0.17422369122505188, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0035, + "step": 20280 + }, + { + "epoch": 1.2418140645082318, + "grad_norm": 0.15332955121994019, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0049, + "step": 20290 + }, + { + "epoch": 1.242426097068364, + "grad_norm": 0.16183018684387207, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0042, + "step": 20300 + }, + { + "epoch": 1.2430381296284962, + "grad_norm": 0.28421106934547424, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0045, + "step": 20310 + }, + { + "epoch": 1.2436501621886284, + "grad_norm": 0.23288874328136444, + "learning_rate": 6.512107839793337e-06, + "loss": 0.004, + "step": 20320 + }, + { + "epoch": 1.2442621947487607, + "grad_norm": 0.17955242097377777, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 1.2448742273088929, + "grad_norm": 0.20192117989063263, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 1.245486259869025, + "grad_norm": 0.15365810692310333, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 1.2460982924291573, + "grad_norm": 0.25220832228660583, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0039, + "step": 20360 + }, + { + "epoch": 1.2467103249892895, + "grad_norm": 0.25777462124824524, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0053, + "step": 20370 + }, + { + "epoch": 1.2473223575494217, + "grad_norm": 0.2693277895450592, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0053, + "step": 20380 + }, + { + "epoch": 1.2479343901095539, + "grad_norm": 0.22846420109272003, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0033, + "step": 20390 + }, + { + "epoch": 1.248546422669686, + "grad_norm": 0.17022505402565002, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0046, + "step": 20400 + }, + { + "epoch": 1.2491584552298183, + "grad_norm": 0.08295682072639465, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0035, + "step": 20410 + }, + { + "epoch": 1.2497704877899505, + "grad_norm": 0.2745625972747803, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0044, + "step": 20420 + }, + { + "epoch": 1.2503825203500827, + "grad_norm": 0.12855033576488495, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2509945529102149, + "grad_norm": 0.30358386039733887, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0049, + "step": 20440 + }, + { + "epoch": 1.251606585470347, + "grad_norm": 0.15514959394931793, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0048, + "step": 20450 + }, + { + "epoch": 1.2522186180304793, + "grad_norm": 0.1414988487958908, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0088, + "step": 20460 + }, + { + "epoch": 1.2528306505906115, + "grad_norm": 0.17399665713310242, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0046, + "step": 20470 + }, + { + "epoch": 1.2534426831507437, + "grad_norm": 0.22629426419734955, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.254054715710876, + "grad_norm": 0.30595293641090393, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0027, + "step": 20490 + }, + { + "epoch": 1.254666748271008, + "grad_norm": 0.17980262637138367, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2552787808311403, + "grad_norm": 0.19016452133655548, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0029, + "step": 20510 + }, + { + "epoch": 1.2558908133912725, + "grad_norm": 0.20200394093990326, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0037, + "step": 20520 + }, + { + "epoch": 1.2565028459514047, + "grad_norm": 0.15347513556480408, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0041, + "step": 20530 + }, + { + "epoch": 1.257114878511537, + "grad_norm": 0.1851687729358673, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0042, + "step": 20540 + }, + { + "epoch": 1.2577269110716691, + "grad_norm": 0.2529662549495697, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0037, + "step": 20550 + }, + { + "epoch": 1.2583389436318013, + "grad_norm": 0.18209592998027802, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 1.2589509761919335, + "grad_norm": 0.18981963396072388, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0036, + "step": 20570 + }, + { + "epoch": 1.2595630087520657, + "grad_norm": 0.13232728838920593, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0035, + "step": 20580 + }, + { + "epoch": 1.260175041312198, + "grad_norm": 0.133514404296875, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0039, + "step": 20590 + }, + { + "epoch": 1.2607870738723301, + "grad_norm": 0.14339123666286469, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0043, + "step": 20600 + }, + { + "epoch": 1.2613991064324623, + "grad_norm": 0.48857489228248596, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0045, + "step": 20610 + }, + { + "epoch": 1.2620111389925945, + "grad_norm": 0.1513262242078781, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0029, + "step": 20620 + }, + { + "epoch": 1.2626231715527267, + "grad_norm": 0.1497354805469513, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0039, + "step": 20630 + }, + { + "epoch": 1.2632352041128587, + "grad_norm": 0.132791206240654, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0037, + "step": 20640 + }, + { + "epoch": 1.263847236672991, + "grad_norm": 0.13804496824741364, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0035, + "step": 20650 + }, + { + "epoch": 1.2644592692331231, + "grad_norm": 0.19393391907215118, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0049, + "step": 20660 + }, + { + "epoch": 1.2650713017932553, + "grad_norm": 0.17623338103294373, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0038, + "step": 20670 + }, + { + "epoch": 1.2656833343533875, + "grad_norm": 0.26931124925613403, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0042, + "step": 20680 + }, + { + "epoch": 1.2662953669135197, + "grad_norm": 0.17984439432621002, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0036, + "step": 20690 + }, + { + "epoch": 1.266907399473652, + "grad_norm": 0.19648219645023346, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0046, + "step": 20700 + }, + { + "epoch": 1.2675194320337841, + "grad_norm": 0.1464766263961792, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0024, + "step": 20710 + }, + { + "epoch": 1.2681314645939163, + "grad_norm": 0.1271074265241623, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0042, + "step": 20720 + }, + { + "epoch": 1.2687434971540485, + "grad_norm": 0.15960967540740967, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0079, + "step": 20730 + }, + { + "epoch": 1.2693555297141808, + "grad_norm": 0.13636153936386108, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0046, + "step": 20740 + }, + { + "epoch": 1.269967562274313, + "grad_norm": 0.19099050760269165, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0046, + "step": 20750 + }, + { + "epoch": 1.2705795948344452, + "grad_norm": 0.28632739186286926, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0036, + "step": 20760 + }, + { + "epoch": 1.2711916273945774, + "grad_norm": 0.2565019726753235, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0055, + "step": 20770 + }, + { + "epoch": 1.2718036599547096, + "grad_norm": 0.24443399906158447, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2724156925148418, + "grad_norm": 0.1396762877702713, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0029, + "step": 20790 + }, + { + "epoch": 1.273027725074974, + "grad_norm": 0.3028377890586853, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0061, + "step": 20800 + }, + { + "epoch": 1.2736397576351062, + "grad_norm": 0.18195804953575134, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0034, + "step": 20810 + }, + { + "epoch": 1.2742517901952384, + "grad_norm": 0.16194652020931244, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0054, + "step": 20820 + }, + { + "epoch": 1.2748638227553706, + "grad_norm": 0.13011956214904785, + "learning_rate": 6.08816828695283e-06, + "loss": 0.003, + "step": 20830 + }, + { + "epoch": 1.2754758553155028, + "grad_norm": 0.23294220864772797, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0041, + "step": 20840 + }, + { + "epoch": 1.276087887875635, + "grad_norm": 0.1892961710691452, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0031, + "step": 20850 + }, + { + "epoch": 1.2766999204357672, + "grad_norm": 0.1984476000070572, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0046, + "step": 20860 + }, + { + "epoch": 1.2773119529958994, + "grad_norm": 0.158709317445755, + "learning_rate": 6.055535530104466e-06, + "loss": 0.003, + "step": 20870 + }, + { + "epoch": 1.2779239855560316, + "grad_norm": 0.16505110263824463, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0039, + "step": 20880 + }, + { + "epoch": 1.2785360181161638, + "grad_norm": 0.18332232534885406, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0036, + "step": 20890 + }, + { + "epoch": 1.279148050676296, + "grad_norm": 0.1797804981470108, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0049, + "step": 20900 + }, + { + "epoch": 1.2797600832364282, + "grad_norm": 0.19247964024543762, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0053, + "step": 20910 + }, + { + "epoch": 1.2803721157965604, + "grad_norm": 0.17845408618450165, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0045, + "step": 20920 + }, + { + "epoch": 1.2809841483566926, + "grad_norm": 0.09454555809497833, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0027, + "step": 20930 + }, + { + "epoch": 1.2815961809168248, + "grad_norm": 0.12647129595279694, + "learning_rate": 5.998651973182953e-06, + "loss": 0.004, + "step": 20940 + }, + { + "epoch": 1.282208213476957, + "grad_norm": 0.39115941524505615, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0051, + "step": 20950 + }, + { + "epoch": 1.2828202460370892, + "grad_norm": 0.29081296920776367, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 1.2834322785972214, + "grad_norm": 0.1849275827407837, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0042, + "step": 20970 + }, + { + "epoch": 1.2840443111573536, + "grad_norm": 0.24075689911842346, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0031, + "step": 20980 + }, + { + "epoch": 1.2846563437174858, + "grad_norm": 0.12463482469320297, + "learning_rate": 5.958196751005967e-06, + "loss": 0.003, + "step": 20990 + }, + { + "epoch": 1.285268376277618, + "grad_norm": 0.16987742483615875, + "learning_rate": 5.950123419134817e-06, + "loss": 0.004, + "step": 21000 + }, + { + "epoch": 1.2858804088377502, + "grad_norm": 0.20316782593727112, + "learning_rate": 5.942056013575106e-06, + "loss": 0.004, + "step": 21010 + }, + { + "epoch": 1.2864924413978824, + "grad_norm": 0.20989514887332916, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0053, + "step": 21020 + }, + { + "epoch": 1.2871044739580146, + "grad_norm": 0.33795273303985596, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0048, + "step": 21030 + }, + { + "epoch": 1.2877165065181468, + "grad_norm": 0.13918501138687134, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.004, + "step": 21040 + }, + { + "epoch": 1.288328539078279, + "grad_norm": 0.2992899715900421, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0038, + "step": 21050 + }, + { + "epoch": 1.288940571638411, + "grad_norm": 0.2540164589881897, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 1.2895526041985432, + "grad_norm": 0.161032035946846, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0047, + "step": 21070 + }, + { + "epoch": 1.2901646367586754, + "grad_norm": 0.1743200421333313, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0037, + "step": 21080 + }, + { + "epoch": 1.2907766693188076, + "grad_norm": 0.26604363322257996, + "learning_rate": 5.877731250949785e-06, + "loss": 0.004, + "step": 21090 + }, + { + "epoch": 1.2913887018789398, + "grad_norm": 0.275696724653244, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0044, + "step": 21100 + }, + { + "epoch": 1.292000734439072, + "grad_norm": 0.16888457536697388, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0042, + "step": 21110 + }, + { + "epoch": 1.2926127669992042, + "grad_norm": 0.12902231514453888, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0048, + "step": 21120 + }, + { + "epoch": 1.2932247995593364, + "grad_norm": 0.14577728509902954, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0046, + "step": 21130 + }, + { + "epoch": 1.2938368321194686, + "grad_norm": 0.1544434279203415, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0031, + "step": 21140 + }, + { + "epoch": 1.2944488646796009, + "grad_norm": 0.09238115698099136, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0035, + "step": 21150 + }, + { + "epoch": 1.295060897239733, + "grad_norm": 0.1770051270723343, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0033, + "step": 21160 + }, + { + "epoch": 1.2956729297998653, + "grad_norm": 0.20360831916332245, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0037, + "step": 21170 + }, + { + "epoch": 1.2962849623599975, + "grad_norm": 0.18503794074058533, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0045, + "step": 21180 + }, + { + "epoch": 1.2968969949201297, + "grad_norm": 0.12918968498706818, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0048, + "step": 21190 + }, + { + "epoch": 1.2975090274802619, + "grad_norm": 0.14289438724517822, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0041, + "step": 21200 + }, + { + "epoch": 1.298121060040394, + "grad_norm": 0.17546117305755615, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0042, + "step": 21210 + }, + { + "epoch": 1.2987330926005263, + "grad_norm": 0.2919277846813202, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0051, + "step": 21220 + }, + { + "epoch": 1.2993451251606585, + "grad_norm": 0.0988069474697113, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0044, + "step": 21230 + }, + { + "epoch": 1.2999571577207907, + "grad_norm": 0.19284513592720032, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0037, + "step": 21240 + }, + { + "epoch": 1.3005691902809229, + "grad_norm": 0.12894058227539062, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0031, + "step": 21250 + }, + { + "epoch": 1.301181222841055, + "grad_norm": 0.14740346372127533, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.3017932554011873, + "grad_norm": 0.16817794740200043, + "learning_rate": 5.734414476316747e-06, + "loss": 0.005, + "step": 21270 + }, + { + "epoch": 1.3024052879613195, + "grad_norm": 0.29237234592437744, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0039, + "step": 21280 + }, + { + "epoch": 1.3030173205214517, + "grad_norm": 0.12649856507778168, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.303629353081584, + "grad_norm": 0.11057443916797638, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0039, + "step": 21300 + }, + { + "epoch": 1.304241385641716, + "grad_norm": 0.13494674861431122, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.3048534182018483, + "grad_norm": 0.3079472482204437, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0042, + "step": 21320 + }, + { + "epoch": 1.3054654507619805, + "grad_norm": 0.13513535261154175, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.3060774833221127, + "grad_norm": 0.39266663789749146, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0046, + "step": 21340 + }, + { + "epoch": 1.306689515882245, + "grad_norm": 0.15097978711128235, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.3073015484423771, + "grad_norm": 0.25206202268600464, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0049, + "step": 21360 + }, + { + "epoch": 1.3079135810025093, + "grad_norm": 0.16765817999839783, + "learning_rate": 5.655655685355026e-06, + "loss": 0.005, + "step": 21370 + }, + { + "epoch": 1.3085256135626415, + "grad_norm": 0.2137158215045929, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0048, + "step": 21380 + }, + { + "epoch": 1.3091376461227737, + "grad_norm": 0.19711454212665558, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0043, + "step": 21390 + }, + { + "epoch": 1.309749678682906, + "grad_norm": 0.1722051054239273, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0044, + "step": 21400 + }, + { + "epoch": 1.3103617112430381, + "grad_norm": 0.1807536482810974, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 1.3109737438031703, + "grad_norm": 0.15052185952663422, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.004, + "step": 21420 + }, + { + "epoch": 1.3115857763633025, + "grad_norm": 0.1485220491886139, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.3121978089234347, + "grad_norm": 0.15065325796604156, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0037, + "step": 21440 + }, + { + "epoch": 1.312809841483567, + "grad_norm": 0.17903591692447662, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0047, + "step": 21450 + }, + { + "epoch": 1.3134218740436991, + "grad_norm": 0.14310622215270996, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0043, + "step": 21460 + }, + { + "epoch": 1.3140339066038313, + "grad_norm": 0.12117830663919449, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0053, + "step": 21470 + }, + { + "epoch": 1.3146459391639636, + "grad_norm": 0.1484573632478714, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 1.3152579717240958, + "grad_norm": 0.16559219360351562, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0037, + "step": 21490 + }, + { + "epoch": 1.315870004284228, + "grad_norm": 0.21626432240009308, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0031, + "step": 21500 + }, + { + "epoch": 1.3164820368443602, + "grad_norm": 0.08177383989095688, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0052, + "step": 21510 + }, + { + "epoch": 1.3170940694044924, + "grad_norm": 0.18640732765197754, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0062, + "step": 21520 + }, + { + "epoch": 1.3177061019646246, + "grad_norm": 0.2599853277206421, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0039, + "step": 21530 + }, + { + "epoch": 1.3183181345247568, + "grad_norm": 0.1591203212738037, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0034, + "step": 21540 + }, + { + "epoch": 1.318930167084889, + "grad_norm": 0.2834412455558777, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0037, + "step": 21550 + }, + { + "epoch": 1.3195421996450212, + "grad_norm": 0.13853803277015686, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0038, + "step": 21560 + }, + { + "epoch": 1.3201542322051534, + "grad_norm": 0.14707128703594208, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0042, + "step": 21570 + }, + { + "epoch": 1.3207662647652856, + "grad_norm": 0.12561920285224915, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0038, + "step": 21580 + }, + { + "epoch": 1.3213782973254178, + "grad_norm": 0.4156799018383026, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0051, + "step": 21590 + }, + { + "epoch": 1.32199032988555, + "grad_norm": 0.11400662362575531, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0031, + "step": 21600 + }, + { + "epoch": 1.3226023624456822, + "grad_norm": 0.15658807754516602, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0052, + "step": 21610 + }, + { + "epoch": 1.3232143950058144, + "grad_norm": 0.1212862953543663, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0034, + "step": 21620 + }, + { + "epoch": 1.3238264275659466, + "grad_norm": 0.2201654314994812, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0036, + "step": 21630 + }, + { + "epoch": 1.3244384601260788, + "grad_norm": 0.11623375117778778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0032, + "step": 21640 + }, + { + "epoch": 1.325050492686211, + "grad_norm": 0.13092897832393646, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0035, + "step": 21650 + }, + { + "epoch": 1.3256625252463432, + "grad_norm": 0.15409153699874878, + "learning_rate": 5.430834687545416e-06, + "loss": 0.004, + "step": 21660 + }, + { + "epoch": 1.3262745578064754, + "grad_norm": 0.3148297369480133, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0031, + "step": 21670 + }, + { + "epoch": 1.3268865903666076, + "grad_norm": 0.13435055315494537, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0033, + "step": 21680 + }, + { + "epoch": 1.3274986229267398, + "grad_norm": 0.17878089845180511, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0029, + "step": 21690 + }, + { + "epoch": 1.328110655486872, + "grad_norm": 0.1823783665895462, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0039, + "step": 21700 + }, + { + "epoch": 1.3287226880470042, + "grad_norm": 0.14492660760879517, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0033, + "step": 21710 + }, + { + "epoch": 1.3293347206071364, + "grad_norm": 0.1730341762304306, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0041, + "step": 21720 + }, + { + "epoch": 1.3299467531672686, + "grad_norm": 0.07961586117744446, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3305587857274008, + "grad_norm": 0.14440582692623138, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0038, + "step": 21740 + }, + { + "epoch": 1.331170818287533, + "grad_norm": 0.22034496068954468, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0023, + "step": 21750 + }, + { + "epoch": 1.3317828508476652, + "grad_norm": 0.1861305832862854, + "learning_rate": 5.354573491223212e-06, + "loss": 0.005, + "step": 21760 + }, + { + "epoch": 1.3323948834077972, + "grad_norm": 0.15587164461612701, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0044, + "step": 21770 + }, + { + "epoch": 1.3330069159679294, + "grad_norm": 0.6852900981903076, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0075, + "step": 21780 + }, + { + "epoch": 1.3336189485280616, + "grad_norm": 0.14315280318260193, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0034, + "step": 21790 + }, + { + "epoch": 1.3342309810881938, + "grad_norm": 0.350981205701828, + "learning_rate": 5.324254018551227e-06, + "loss": 0.004, + "step": 21800 + }, + { + "epoch": 1.334843013648326, + "grad_norm": 0.12344911694526672, + "learning_rate": 5.316690780174352e-06, + "loss": 0.004, + "step": 21810 + }, + { + "epoch": 1.3354550462084582, + "grad_norm": 0.18744061887264252, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3360670787685904, + "grad_norm": 0.22747837007045746, + "learning_rate": 5.301584321328435e-06, + "loss": 0.004, + "step": 21830 + }, + { + "epoch": 1.3366791113287226, + "grad_norm": 0.22695699334144592, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0042, + "step": 21840 + }, + { + "epoch": 1.3372911438888548, + "grad_norm": 0.17258964478969574, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0044, + "step": 21850 + }, + { + "epoch": 1.337903176448987, + "grad_norm": 0.1523793637752533, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0047, + "step": 21860 + }, + { + "epoch": 1.3385152090091192, + "grad_norm": 0.1983587145805359, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0037, + "step": 21870 + }, + { + "epoch": 1.3391272415692514, + "grad_norm": 0.1263747215270996, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0034, + "step": 21880 + }, + { + "epoch": 1.3397392741293837, + "grad_norm": 0.1550009399652481, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3403513066895159, + "grad_norm": 0.14963915944099426, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.340963339249648, + "grad_norm": 0.17783671617507935, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0036, + "step": 21910 + }, + { + "epoch": 1.3415753718097803, + "grad_norm": 0.2715896964073181, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3421874043699125, + "grad_norm": 0.22924886643886566, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0037, + "step": 21930 + }, + { + "epoch": 1.3427994369300447, + "grad_norm": 0.13689789175987244, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0033, + "step": 21940 + }, + { + "epoch": 1.3434114694901769, + "grad_norm": 0.09137748926877975, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0031, + "step": 21950 + }, + { + "epoch": 1.344023502050309, + "grad_norm": 0.17097881436347961, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0031, + "step": 21960 + }, + { + "epoch": 1.3446355346104413, + "grad_norm": 0.23919200897216797, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0046, + "step": 21970 + }, + { + "epoch": 1.3452475671705735, + "grad_norm": 0.14261527359485626, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0037, + "step": 21980 + }, + { + "epoch": 1.3458595997307057, + "grad_norm": 0.156734898686409, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.3464716322908379, + "grad_norm": 0.21755588054656982, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0032, + "step": 22000 + }, + { + "epoch": 1.34708366485097, + "grad_norm": 0.1373317390680313, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0033, + "step": 22010 + }, + { + "epoch": 1.3476956974111023, + "grad_norm": 0.1646856814622879, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0047, + "step": 22020 + }, + { + "epoch": 1.3483077299712345, + "grad_norm": 0.1908850073814392, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0044, + "step": 22030 + }, + { + "epoch": 1.3489197625313667, + "grad_norm": 0.24862833321094513, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0041, + "step": 22040 + }, + { + "epoch": 1.349531795091499, + "grad_norm": 0.15980397164821625, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0033, + "step": 22050 + }, + { + "epoch": 1.350143827651631, + "grad_norm": 0.1157977357506752, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0036, + "step": 22060 + }, + { + "epoch": 1.3507558602117633, + "grad_norm": 0.11186888068914413, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0046, + "step": 22070 + }, + { + "epoch": 1.3513678927718955, + "grad_norm": 0.17715996503829956, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0035, + "step": 22080 + }, + { + "epoch": 1.3519799253320277, + "grad_norm": 0.1265174001455307, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0048, + "step": 22090 + }, + { + "epoch": 1.35259195789216, + "grad_norm": 0.13969522714614868, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0028, + "step": 22100 + }, + { + "epoch": 1.3532039904522921, + "grad_norm": 0.13246525824069977, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0026, + "step": 22110 + }, + { + "epoch": 1.3538160230124243, + "grad_norm": 0.14675064384937286, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0082, + "step": 22120 + }, + { + "epoch": 1.3544280555725565, + "grad_norm": 0.15810683369636536, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0031, + "step": 22130 + }, + { + "epoch": 1.3550400881326887, + "grad_norm": 0.20675864815711975, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0035, + "step": 22140 + }, + { + "epoch": 1.355652120692821, + "grad_norm": 0.1921442300081253, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0038, + "step": 22150 + }, + { + "epoch": 1.3562641532529531, + "grad_norm": 0.14300711452960968, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0035, + "step": 22160 + }, + { + "epoch": 1.3568761858130853, + "grad_norm": 0.0656728520989418, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0047, + "step": 22170 + }, + { + "epoch": 1.3574882183732175, + "grad_norm": 0.148203507065773, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0041, + "step": 22180 + }, + { + "epoch": 1.3581002509333495, + "grad_norm": 0.15472126007080078, + "learning_rate": 5.034310349217475e-06, + "loss": 0.004, + "step": 22190 + }, + { + "epoch": 1.3587122834934817, + "grad_norm": 0.12006669491529465, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0034, + "step": 22200 + }, + { + "epoch": 1.359324316053614, + "grad_norm": 0.15345145761966705, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0032, + "step": 22210 + }, + { + "epoch": 1.3599363486137461, + "grad_norm": 0.17429186403751373, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0039, + "step": 22220 + }, + { + "epoch": 1.3605483811738783, + "grad_norm": 0.20691345632076263, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0029, + "step": 22230 + }, + { + "epoch": 1.3611604137340105, + "grad_norm": 0.1874946504831314, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0042, + "step": 22240 + }, + { + "epoch": 1.3617724462941427, + "grad_norm": 0.12159912288188934, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0033, + "step": 22250 + }, + { + "epoch": 1.362384478854275, + "grad_norm": 0.29434919357299805, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0044, + "step": 22260 + }, + { + "epoch": 1.3629965114144071, + "grad_norm": 0.06661798804998398, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0031, + "step": 22270 + }, + { + "epoch": 1.3636085439745393, + "grad_norm": 0.14819994568824768, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0039, + "step": 22280 + }, + { + "epoch": 1.3642205765346715, + "grad_norm": 0.17289887368679047, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0035, + "step": 22290 + }, + { + "epoch": 1.3648326090948038, + "grad_norm": 0.18789313733577728, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0036, + "step": 22300 + }, + { + "epoch": 1.365444641654936, + "grad_norm": 0.1877586394548416, + "learning_rate": 4.947215397583639e-06, + "loss": 0.004, + "step": 22310 + }, + { + "epoch": 1.3660566742150682, + "grad_norm": 0.11696574836969376, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0036, + "step": 22320 + }, + { + "epoch": 1.3666687067752004, + "grad_norm": 0.2511763274669647, + "learning_rate": 4.932798621873274e-06, + "loss": 0.004, + "step": 22330 + }, + { + "epoch": 1.3672807393353326, + "grad_norm": 0.15005314350128174, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0046, + "step": 22340 + }, + { + "epoch": 1.3678927718954648, + "grad_norm": 0.16856855154037476, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0056, + "step": 22350 + }, + { + "epoch": 1.368504804455597, + "grad_norm": 0.24532385170459747, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0035, + "step": 22360 + }, + { + "epoch": 1.3691168370157292, + "grad_norm": 0.29320162534713745, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0047, + "step": 22370 + }, + { + "epoch": 1.3697288695758614, + "grad_norm": 0.1518300473690033, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0041, + "step": 22380 + }, + { + "epoch": 1.3703409021359936, + "grad_norm": 0.13431201875209808, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0033, + "step": 22390 + }, + { + "epoch": 1.3709529346961258, + "grad_norm": 0.17390409111976624, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0039, + "step": 22400 + }, + { + "epoch": 1.371564967256258, + "grad_norm": 0.16482478380203247, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.007, + "step": 22410 + }, + { + "epoch": 1.3721769998163902, + "grad_norm": 0.11469490826129913, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0041, + "step": 22420 + }, + { + "epoch": 1.3727890323765224, + "grad_norm": 0.2327135056257248, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0043, + "step": 22430 + }, + { + "epoch": 1.3734010649366546, + "grad_norm": 0.1373092532157898, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0036, + "step": 22440 + }, + { + "epoch": 1.3740130974967868, + "grad_norm": 0.1534084528684616, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0028, + "step": 22450 + }, + { + "epoch": 1.374625130056919, + "grad_norm": 0.3217960596084595, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0044, + "step": 22460 + }, + { + "epoch": 1.3752371626170512, + "grad_norm": 0.14245563745498657, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0039, + "step": 22470 + }, + { + "epoch": 1.3758491951771834, + "grad_norm": 0.17652876675128937, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0031, + "step": 22480 + }, + { + "epoch": 1.3764612277373156, + "grad_norm": 0.1996244192123413, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0034, + "step": 22490 + }, + { + "epoch": 1.3770732602974478, + "grad_norm": 0.1658472716808319, + "learning_rate": 4.81141273556404e-06, + "loss": 0.003, + "step": 22500 + }, + { + "epoch": 1.37768529285758, + "grad_norm": 0.16233472526073456, + "learning_rate": 4.804337352679613e-06, + "loss": 0.004, + "step": 22510 + }, + { + "epoch": 1.3782973254177122, + "grad_norm": 0.13045033812522888, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0049, + "step": 22520 + }, + { + "epoch": 1.3789093579778444, + "grad_norm": 0.1195274218916893, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0042, + "step": 22530 + }, + { + "epoch": 1.3795213905379766, + "grad_norm": 0.14395804703235626, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0036, + "step": 22540 + }, + { + "epoch": 1.3801334230981088, + "grad_norm": 0.24495497345924377, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0047, + "step": 22550 + }, + { + "epoch": 1.380745455658241, + "grad_norm": 0.14288006722927094, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0044, + "step": 22560 + }, + { + "epoch": 1.3813574882183732, + "grad_norm": 0.16967979073524475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0051, + "step": 22570 + }, + { + "epoch": 1.3819695207785054, + "grad_norm": 0.2023036777973175, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0032, + "step": 22580 + }, + { + "epoch": 1.3825815533386376, + "grad_norm": 0.1191902756690979, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0026, + "step": 22590 + }, + { + "epoch": 1.3831935858987698, + "grad_norm": 0.16922403872013092, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0037, + "step": 22600 + }, + { + "epoch": 1.383805618458902, + "grad_norm": 0.12394976615905762, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0038, + "step": 22610 + }, + { + "epoch": 1.3844176510190342, + "grad_norm": 0.23889753222465515, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0041, + "step": 22620 + }, + { + "epoch": 1.3850296835791664, + "grad_norm": 0.31215062737464905, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0036, + "step": 22630 + }, + { + "epoch": 1.3856417161392987, + "grad_norm": 0.1519152820110321, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0042, + "step": 22640 + }, + { + "epoch": 1.3862537486994309, + "grad_norm": 0.3375433683395386, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0039, + "step": 22650 + }, + { + "epoch": 1.386865781259563, + "grad_norm": 0.21715323626995087, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0027, + "step": 22660 + }, + { + "epoch": 1.3874778138196953, + "grad_norm": 0.2066027969121933, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0033, + "step": 22670 + }, + { + "epoch": 1.3880898463798275, + "grad_norm": 0.11542408168315887, + "learning_rate": 4.6851750421442e-06, + "loss": 0.004, + "step": 22680 + }, + { + "epoch": 1.3887018789399597, + "grad_norm": 0.1183561235666275, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0037, + "step": 22690 + }, + { + "epoch": 1.3893139115000919, + "grad_norm": 0.24478662014007568, + "learning_rate": 4.67129597392514e-06, + "loss": 0.004, + "step": 22700 + }, + { + "epoch": 1.389925944060224, + "grad_norm": 0.28880801796913147, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0039, + "step": 22710 + }, + { + "epoch": 1.3905379766203563, + "grad_norm": 0.14014701545238495, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0034, + "step": 22720 + }, + { + "epoch": 1.3911500091804885, + "grad_norm": 0.1549793928861618, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0033, + "step": 22730 + }, + { + "epoch": 1.3917620417406207, + "grad_norm": 0.1423012614250183, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.3923740743007529, + "grad_norm": 0.291273832321167, + "learning_rate": 4.636728419531758e-06, + "loss": 0.004, + "step": 22750 + }, + { + "epoch": 1.392986106860885, + "grad_norm": 0.38278621435165405, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0045, + "step": 22760 + }, + { + "epoch": 1.3935981394210173, + "grad_norm": 0.20528365671634674, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0042, + "step": 22770 + }, + { + "epoch": 1.3942101719811495, + "grad_norm": 0.11913729459047318, + "learning_rate": 4.616077433849538e-06, + "loss": 0.003, + "step": 22780 + }, + { + "epoch": 1.3948222045412817, + "grad_norm": 0.21683627367019653, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0027, + "step": 22790 + }, + { + "epoch": 1.395434237101414, + "grad_norm": 0.12143554538488388, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0031, + "step": 22800 + }, + { + "epoch": 1.396046269661546, + "grad_norm": 0.14171159267425537, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0039, + "step": 22810 + }, + { + "epoch": 1.3966583022216783, + "grad_norm": 0.19254790246486664, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0043, + "step": 22820 + }, + { + "epoch": 1.3972703347818105, + "grad_norm": 0.12295825034379959, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0045, + "step": 22830 + }, + { + "epoch": 1.3978823673419427, + "grad_norm": 0.1274985820055008, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0037, + "step": 22840 + }, + { + "epoch": 1.398494399902075, + "grad_norm": 0.2940427362918854, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0059, + "step": 22850 + }, + { + "epoch": 1.3991064324622071, + "grad_norm": 0.15357589721679688, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0032, + "step": 22860 + }, + { + "epoch": 1.3997184650223393, + "grad_norm": 0.12781603634357452, + "learning_rate": 4.554529907376127e-06, + "loss": 0.003, + "step": 22870 + }, + { + "epoch": 1.4003304975824715, + "grad_norm": 0.34976109862327576, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0047, + "step": 22880 + }, + { + "epoch": 1.4009425301426035, + "grad_norm": 0.1797824203968048, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0034, + "step": 22890 + }, + { + "epoch": 1.4015545627027357, + "grad_norm": 0.13750647008419037, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0046, + "step": 22900 + }, + { + "epoch": 1.402166595262868, + "grad_norm": 0.22893266379833221, + "learning_rate": 4.527371771040039e-06, + "loss": 0.005, + "step": 22910 + }, + { + "epoch": 1.4027786278230001, + "grad_norm": 0.1595923751592636, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0045, + "step": 22920 + }, + { + "epoch": 1.4033906603831323, + "grad_norm": 0.11474192142486572, + "learning_rate": 4.513838246961138e-06, + "loss": 0.003, + "step": 22930 + }, + { + "epoch": 1.4040026929432645, + "grad_norm": 0.12208060175180435, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.4046147255033967, + "grad_norm": 0.2919016480445862, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0036, + "step": 22950 + }, + { + "epoch": 1.405226758063529, + "grad_norm": 0.19161155819892883, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0049, + "step": 22960 + }, + { + "epoch": 1.4058387906236611, + "grad_norm": 0.1454700380563736, + "learning_rate": 4.486862604628113e-06, + "loss": 0.004, + "step": 22970 + }, + { + "epoch": 1.4064508231837933, + "grad_norm": 0.227305606007576, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0043, + "step": 22980 + }, + { + "epoch": 1.4070628557439255, + "grad_norm": 0.09430288523435593, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0051, + "step": 22990 + }, + { + "epoch": 1.4076748883040577, + "grad_norm": 0.09664178639650345, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.40828692086419, + "grad_norm": 0.21268269419670105, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0031, + "step": 23010 + }, + { + "epoch": 1.4088989534243221, + "grad_norm": 0.09796992689371109, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0041, + "step": 23020 + }, + { + "epoch": 1.4095109859844543, + "grad_norm": 0.18376071751117706, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0039, + "step": 23030 + }, + { + "epoch": 1.4101230185445865, + "grad_norm": 0.10276145488023758, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0035, + "step": 23040 + }, + { + "epoch": 1.4107350511047188, + "grad_norm": 0.16089564561843872, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0051, + "step": 23050 + }, + { + "epoch": 1.411347083664851, + "grad_norm": 0.1825491487979889, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0036, + "step": 23060 + }, + { + "epoch": 1.4119591162249832, + "grad_norm": 0.24405492842197418, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0028, + "step": 23070 + }, + { + "epoch": 1.4125711487851154, + "grad_norm": 0.14085668325424194, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0039, + "step": 23080 + }, + { + "epoch": 1.4131831813452476, + "grad_norm": 0.11708472669124603, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0035, + "step": 23090 + }, + { + "epoch": 1.4137952139053798, + "grad_norm": 0.12108796834945679, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0036, + "step": 23100 + }, + { + "epoch": 1.414407246465512, + "grad_norm": 0.14601854979991913, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0036, + "step": 23110 + }, + { + "epoch": 1.4150192790256442, + "grad_norm": 0.10614772886037827, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0031, + "step": 23120 + }, + { + "epoch": 1.4156313115857764, + "grad_norm": 0.09014416486024857, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0027, + "step": 23130 + }, + { + "epoch": 1.4162433441459086, + "grad_norm": 0.15246634185314178, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0031, + "step": 23140 + }, + { + "epoch": 1.4168553767060408, + "grad_norm": 0.20104879140853882, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0075, + "step": 23150 + }, + { + "epoch": 1.417467409266173, + "grad_norm": 0.1359969973564148, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0036, + "step": 23160 + }, + { + "epoch": 1.4180794418263052, + "grad_norm": 0.19849587976932526, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0054, + "step": 23170 + }, + { + "epoch": 1.4186914743864374, + "grad_norm": 0.12617377936840057, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0024, + "step": 23180 + }, + { + "epoch": 1.4193035069465696, + "grad_norm": 0.15024134516716003, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0034, + "step": 23190 + }, + { + "epoch": 1.4199155395067018, + "grad_norm": 0.2345605194568634, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0036, + "step": 23200 + }, + { + "epoch": 1.420527572066834, + "grad_norm": 0.13125917315483093, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0035, + "step": 23210 + }, + { + "epoch": 1.4211396046269662, + "grad_norm": 0.20977836847305298, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0038, + "step": 23220 + }, + { + "epoch": 1.4217516371870984, + "grad_norm": 0.3925677537918091, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0043, + "step": 23230 + }, + { + "epoch": 1.4223636697472306, + "grad_norm": 0.17691555619239807, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0048, + "step": 23240 + }, + { + "epoch": 1.4229757023073628, + "grad_norm": 0.18366187810897827, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0033, + "step": 23250 + }, + { + "epoch": 1.423587734867495, + "grad_norm": 0.15539205074310303, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0039, + "step": 23260 + }, + { + "epoch": 1.4241997674276272, + "grad_norm": 0.15048520267009735, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0032, + "step": 23270 + }, + { + "epoch": 1.4248117999877594, + "grad_norm": 0.2631739675998688, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0061, + "step": 23280 + }, + { + "epoch": 1.4254238325478916, + "grad_norm": 0.18545641005039215, + "learning_rate": 4.275502195405868e-06, + "loss": 0.005, + "step": 23290 + }, + { + "epoch": 1.4260358651080238, + "grad_norm": 0.25486356019973755, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.426647897668156, + "grad_norm": 0.2514204978942871, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.427259930228288, + "grad_norm": 0.12997376918792725, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0034, + "step": 23320 + }, + { + "epoch": 1.4278719627884202, + "grad_norm": 0.26096200942993164, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0047, + "step": 23330 + }, + { + "epoch": 1.4284839953485524, + "grad_norm": 0.2292930781841278, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0038, + "step": 23340 + }, + { + "epoch": 1.4290960279086846, + "grad_norm": 0.20056717097759247, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0037, + "step": 23350 + }, + { + "epoch": 1.4297080604688168, + "grad_norm": 0.1608581393957138, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0032, + "step": 23360 + }, + { + "epoch": 1.430320093028949, + "grad_norm": 0.235102578997612, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0034, + "step": 23370 + }, + { + "epoch": 1.4309321255890812, + "grad_norm": 0.11869259178638458, + "learning_rate": 4.217502203129258e-06, + "loss": 0.005, + "step": 23380 + }, + { + "epoch": 1.4315441581492134, + "grad_norm": 0.167036771774292, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0045, + "step": 23390 + }, + { + "epoch": 1.4321561907093456, + "grad_norm": 0.13766071200370789, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.4327682232694778, + "grad_norm": 0.15444986522197723, + "learning_rate": 4.198311874248223e-06, + "loss": 0.004, + "step": 23410 + }, + { + "epoch": 1.43338025582961, + "grad_norm": 0.11997724324464798, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0034, + "step": 23420 + }, + { + "epoch": 1.4339922883897422, + "grad_norm": 0.1533307433128357, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0038, + "step": 23430 + }, + { + "epoch": 1.4346043209498744, + "grad_norm": 0.10954161733388901, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0045, + "step": 23440 + }, + { + "epoch": 1.4352163535100066, + "grad_norm": 0.16601058840751648, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4358283860701389, + "grad_norm": 0.1756889373064041, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0035, + "step": 23460 + }, + { + "epoch": 1.436440418630271, + "grad_norm": 0.12633845210075378, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0046, + "step": 23470 + }, + { + "epoch": 1.4370524511904033, + "grad_norm": 0.15678541362285614, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0025, + "step": 23480 + }, + { + "epoch": 1.4376644837505355, + "grad_norm": 0.13923659920692444, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0052, + "step": 23490 + }, + { + "epoch": 1.4382765163106677, + "grad_norm": 0.28792211413383484, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4388885488707999, + "grad_norm": 0.16125047206878662, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0037, + "step": 23510 + }, + { + "epoch": 1.439500581430932, + "grad_norm": 0.2653597593307495, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0067, + "step": 23520 + }, + { + "epoch": 1.4401126139910643, + "grad_norm": 0.2692917585372925, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0031, + "step": 23530 + }, + { + "epoch": 1.4407246465511965, + "grad_norm": 0.2234862893819809, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0044, + "step": 23540 + }, + { + "epoch": 1.4413366791113287, + "grad_norm": 0.17526887357234955, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0025, + "step": 23550 + }, + { + "epoch": 1.4419487116714609, + "grad_norm": 0.10404029488563538, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0041, + "step": 23560 + }, + { + "epoch": 1.442560744231593, + "grad_norm": 0.1385052353143692, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0027, + "step": 23570 + }, + { + "epoch": 1.4431727767917253, + "grad_norm": 0.30865412950515747, + "learning_rate": 4.090929556079854e-06, + "loss": 0.004, + "step": 23580 + }, + { + "epoch": 1.4437848093518575, + "grad_norm": 0.10908320546150208, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0041, + "step": 23590 + }, + { + "epoch": 1.4443968419119897, + "grad_norm": 0.09885916113853455, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0045, + "step": 23600 + }, + { + "epoch": 1.445008874472122, + "grad_norm": 0.1685211956501007, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0031, + "step": 23610 + }, + { + "epoch": 1.445620907032254, + "grad_norm": 0.0967954769730568, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0031, + "step": 23620 + }, + { + "epoch": 1.4462329395923863, + "grad_norm": 0.07489120960235596, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0038, + "step": 23630 + }, + { + "epoch": 1.4468449721525185, + "grad_norm": 0.20616063475608826, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0043, + "step": 23640 + }, + { + "epoch": 1.4474570047126507, + "grad_norm": 0.15788249671459198, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0031, + "step": 23650 + }, + { + "epoch": 1.448069037272783, + "grad_norm": 0.10360633581876755, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0027, + "step": 23660 + }, + { + "epoch": 1.4486810698329151, + "grad_norm": 0.2871163785457611, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0028, + "step": 23670 + }, + { + "epoch": 1.4492931023930473, + "grad_norm": 0.15280364453792572, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0042, + "step": 23680 + }, + { + "epoch": 1.4499051349531795, + "grad_norm": 0.17502477765083313, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0031, + "step": 23690 + }, + { + "epoch": 1.4505171675133117, + "grad_norm": 0.2154005616903305, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0036, + "step": 23700 + }, + { + "epoch": 1.451129200073444, + "grad_norm": 0.15002919733524323, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0033, + "step": 23710 + }, + { + "epoch": 1.4517412326335761, + "grad_norm": 0.10422170162200928, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0032, + "step": 23720 + }, + { + "epoch": 1.4523532651937083, + "grad_norm": 0.15197636187076569, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0043, + "step": 23730 + }, + { + "epoch": 1.4529652977538405, + "grad_norm": 0.2571481466293335, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.4535773303139727, + "grad_norm": 0.12697578966617584, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0034, + "step": 23750 + }, + { + "epoch": 1.454189362874105, + "grad_norm": 0.14347535371780396, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0051, + "step": 23760 + }, + { + "epoch": 1.4548013954342371, + "grad_norm": 0.1494351178407669, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0037, + "step": 23770 + }, + { + "epoch": 1.4554134279943693, + "grad_norm": 0.23901797831058502, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0032, + "step": 23780 + }, + { + "epoch": 1.4560254605545015, + "grad_norm": 0.1434790939092636, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0036, + "step": 23790 + }, + { + "epoch": 1.4566374931146338, + "grad_norm": 0.1456829458475113, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0034, + "step": 23800 + }, + { + "epoch": 1.457249525674766, + "grad_norm": 0.33969590067863464, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0038, + "step": 23810 + }, + { + "epoch": 1.4578615582348982, + "grad_norm": 0.1768753081560135, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0027, + "step": 23820 + }, + { + "epoch": 1.4584735907950304, + "grad_norm": 0.15212708711624146, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0032, + "step": 23830 + }, + { + "epoch": 1.4590856233551626, + "grad_norm": 0.10870973765850067, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0033, + "step": 23840 + }, + { + "epoch": 1.4596976559152948, + "grad_norm": 0.17898528277873993, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0022, + "step": 23850 + }, + { + "epoch": 1.460309688475427, + "grad_norm": 0.15515227615833282, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0034, + "step": 23860 + }, + { + "epoch": 1.4609217210355592, + "grad_norm": 0.11047070473432541, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0032, + "step": 23870 + }, + { + "epoch": 1.4615337535956914, + "grad_norm": 0.08628113567829132, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0032, + "step": 23880 + }, + { + "epoch": 1.4621457861558236, + "grad_norm": 0.358903706073761, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0043, + "step": 23890 + }, + { + "epoch": 1.4627578187159558, + "grad_norm": 0.13986052572727203, + "learning_rate": 3.895183209452123e-06, + "loss": 0.003, + "step": 23900 + }, + { + "epoch": 1.463369851276088, + "grad_norm": 0.09236793220043182, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0029, + "step": 23910 + }, + { + "epoch": 1.4639818838362202, + "grad_norm": 0.14616963267326355, + "learning_rate": 3.883230136754435e-06, + "loss": 0.005, + "step": 23920 + }, + { + "epoch": 1.4645939163963524, + "grad_norm": 0.0754290223121643, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0031, + "step": 23930 + }, + { + "epoch": 1.4652059489564846, + "grad_norm": 0.16520163416862488, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0037, + "step": 23940 + }, + { + "epoch": 1.4658179815166168, + "grad_norm": 0.06801608204841614, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0023, + "step": 23950 + }, + { + "epoch": 1.466430014076749, + "grad_norm": 0.3087909519672394, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0042, + "step": 23960 + }, + { + "epoch": 1.4670420466368812, + "grad_norm": 0.23470532894134521, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0046, + "step": 23970 + }, + { + "epoch": 1.4676540791970134, + "grad_norm": 0.10248749703168869, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0025, + "step": 23980 + }, + { + "epoch": 1.4682661117571456, + "grad_norm": 0.12478570640087128, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0035, + "step": 23990 + }, + { + "epoch": 1.4688781443172778, + "grad_norm": 0.16669252514839172, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0034, + "step": 24000 + }, + { + "epoch": 1.46949017687741, + "grad_norm": 0.12477939575910568, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.470102209437542, + "grad_norm": 0.1738445907831192, + "learning_rate": 3.823967005382315e-06, + "loss": 0.003, + "step": 24020 + }, + { + "epoch": 1.4707142419976742, + "grad_norm": 0.11228524148464203, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0029, + "step": 24030 + }, + { + "epoch": 1.4713262745578064, + "grad_norm": 0.28472721576690674, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0035, + "step": 24040 + }, + { + "epoch": 1.4719383071179386, + "grad_norm": 0.18087328970432281, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0037, + "step": 24050 + }, + { + "epoch": 1.4725503396780708, + "grad_norm": 0.39030423760414124, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0043, + "step": 24060 + }, + { + "epoch": 1.473162372238203, + "grad_norm": 0.164345845580101, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0029, + "step": 24070 + }, + { + "epoch": 1.4737744047983352, + "grad_norm": 0.14081600308418274, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0038, + "step": 24080 + }, + { + "epoch": 1.4743864373584674, + "grad_norm": 0.27649205923080444, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0037, + "step": 24090 + }, + { + "epoch": 1.4749984699185996, + "grad_norm": 0.08673480153083801, + "learning_rate": 3.777162510056721e-06, + "loss": 0.004, + "step": 24100 + }, + { + "epoch": 1.4756105024787318, + "grad_norm": 0.11770286411046982, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0033, + "step": 24110 + }, + { + "epoch": 1.476222535038864, + "grad_norm": 0.11967290937900543, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0035, + "step": 24120 + }, + { + "epoch": 1.4768345675989962, + "grad_norm": 0.12635833024978638, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0043, + "step": 24130 + }, + { + "epoch": 1.4774466001591284, + "grad_norm": 0.13505803048610687, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0034, + "step": 24140 + }, + { + "epoch": 1.4780586327192606, + "grad_norm": 0.17781652510166168, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0031, + "step": 24150 + }, + { + "epoch": 1.4786706652793928, + "grad_norm": 0.18974725902080536, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0051, + "step": 24160 + }, + { + "epoch": 1.479282697839525, + "grad_norm": 0.12072815746068954, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0027, + "step": 24170 + }, + { + "epoch": 1.4798947303996572, + "grad_norm": 0.10813914984464645, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0023, + "step": 24180 + }, + { + "epoch": 1.4805067629597894, + "grad_norm": 0.07975378632545471, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0035, + "step": 24190 + }, + { + "epoch": 1.4811187955199216, + "grad_norm": 0.0948014184832573, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0033, + "step": 24200 + }, + { + "epoch": 1.4817308280800539, + "grad_norm": 0.11943913251161575, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0038, + "step": 24210 + }, + { + "epoch": 1.482342860640186, + "grad_norm": 0.34374934434890747, + "learning_rate": 3.707974016467e-06, + "loss": 0.0043, + "step": 24220 + }, + { + "epoch": 1.4829548932003183, + "grad_norm": 0.264528751373291, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0037, + "step": 24230 + }, + { + "epoch": 1.4835669257604505, + "grad_norm": 0.08419078588485718, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0031, + "step": 24240 + }, + { + "epoch": 1.4841789583205827, + "grad_norm": 0.3805602192878723, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0039, + "step": 24250 + }, + { + "epoch": 1.4847909908807149, + "grad_norm": 0.09091196954250336, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0034, + "step": 24260 + }, + { + "epoch": 1.485403023440847, + "grad_norm": 0.1352047175168991, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0033, + "step": 24270 + }, + { + "epoch": 1.4860150560009793, + "grad_norm": 0.14287787675857544, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0033, + "step": 24280 + }, + { + "epoch": 1.4866270885611115, + "grad_norm": 0.15490861237049103, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0037, + "step": 24290 + }, + { + "epoch": 1.4872391211212437, + "grad_norm": 0.08607941120862961, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0042, + "step": 24300 + }, + { + "epoch": 1.4878511536813759, + "grad_norm": 0.2872561514377594, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0039, + "step": 24310 + }, + { + "epoch": 1.488463186241508, + "grad_norm": 0.09383561462163925, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4890752188016403, + "grad_norm": 0.13576671481132507, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0039, + "step": 24330 + }, + { + "epoch": 1.4896872513617725, + "grad_norm": 0.21924526989459991, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0038, + "step": 24340 + }, + { + "epoch": 1.4902992839219047, + "grad_norm": 0.24333837628364563, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.490911316482037, + "grad_norm": 0.08171682059764862, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0034, + "step": 24360 + }, + { + "epoch": 1.491523349042169, + "grad_norm": 0.11815544962882996, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0027, + "step": 24370 + }, + { + "epoch": 1.4921353816023013, + "grad_norm": 0.15248773992061615, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0035, + "step": 24380 + }, + { + "epoch": 1.4927474141624335, + "grad_norm": 0.13664020597934723, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0032, + "step": 24390 + }, + { + "epoch": 1.4933594467225657, + "grad_norm": 0.2877022624015808, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0034, + "step": 24400 + }, + { + "epoch": 1.493971479282698, + "grad_norm": 0.1447642594575882, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0033, + "step": 24410 + }, + { + "epoch": 1.4945835118428301, + "grad_norm": 0.18032193183898926, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0036, + "step": 24420 + }, + { + "epoch": 1.4951955444029623, + "grad_norm": 0.1249038353562355, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0023, + "step": 24430 + }, + { + "epoch": 1.4958075769630943, + "grad_norm": 0.21674089133739471, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0036, + "step": 24440 + }, + { + "epoch": 1.4964196095232265, + "grad_norm": 0.2503979504108429, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0039, + "step": 24450 + }, + { + "epoch": 1.4970316420833587, + "grad_norm": 0.15412171185016632, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0035, + "step": 24460 + }, + { + "epoch": 1.497643674643491, + "grad_norm": 0.17718803882598877, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0055, + "step": 24470 + }, + { + "epoch": 1.498255707203623, + "grad_norm": 0.24290283024311066, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0033, + "step": 24480 + }, + { + "epoch": 1.4988677397637553, + "grad_norm": 0.20131447911262512, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0035, + "step": 24490 + }, + { + "epoch": 1.4994797723238875, + "grad_norm": 0.18041104078292847, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0037, + "step": 24500 + }, + { + "epoch": 1.5000918048840197, + "grad_norm": 0.11311472952365875, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0027, + "step": 24510 + }, + { + "epoch": 1.500703837444152, + "grad_norm": 0.10401099175214767, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0035, + "step": 24520 + }, + { + "epoch": 1.5013158700042841, + "grad_norm": 0.16640698909759521, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0029, + "step": 24530 + }, + { + "epoch": 1.5019279025644163, + "grad_norm": 0.1116192489862442, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0027, + "step": 24540 + }, + { + "epoch": 1.5025399351245485, + "grad_norm": 0.14617346227169037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0037, + "step": 24550 + }, + { + "epoch": 1.5031519676846807, + "grad_norm": 0.10546499490737915, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0025, + "step": 24560 + }, + { + "epoch": 1.503764000244813, + "grad_norm": 0.11696954816579819, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0039, + "step": 24570 + }, + { + "epoch": 1.5043760328049451, + "grad_norm": 0.1503429412841797, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0038, + "step": 24580 + }, + { + "epoch": 1.5049880653650773, + "grad_norm": 0.13094773888587952, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0037, + "step": 24590 + }, + { + "epoch": 1.5056000979252095, + "grad_norm": 0.1519947648048401, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0027, + "step": 24600 + }, + { + "epoch": 1.5062121304853417, + "grad_norm": 0.3586391806602478, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0041, + "step": 24610 + }, + { + "epoch": 1.506824163045474, + "grad_norm": 0.14964115619659424, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0029, + "step": 24620 + }, + { + "epoch": 1.5074361956056062, + "grad_norm": 0.2676304578781128, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0033, + "step": 24630 + }, + { + "epoch": 1.5080482281657384, + "grad_norm": 0.117411769926548, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0041, + "step": 24640 + }, + { + "epoch": 1.5086602607258706, + "grad_norm": 0.11224953830242157, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0035, + "step": 24650 + }, + { + "epoch": 1.5092722932860028, + "grad_norm": 0.14367471635341644, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0033, + "step": 24660 + }, + { + "epoch": 1.509884325846135, + "grad_norm": 0.27663105726242065, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.004, + "step": 24670 + }, + { + "epoch": 1.5104963584062672, + "grad_norm": 0.08599471300840378, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0031, + "step": 24680 + }, + { + "epoch": 1.5111083909663994, + "grad_norm": 0.11320041120052338, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0033, + "step": 24690 + }, + { + "epoch": 1.5117204235265316, + "grad_norm": 0.0896427258849144, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0032, + "step": 24700 + }, + { + "epoch": 1.5123324560866638, + "grad_norm": 0.1055784597992897, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0028, + "step": 24710 + }, + { + "epoch": 1.512944488646796, + "grad_norm": 0.0936208963394165, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0036, + "step": 24720 + }, + { + "epoch": 1.5135565212069282, + "grad_norm": 0.13069137930870056, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0033, + "step": 24730 + }, + { + "epoch": 1.5141685537670604, + "grad_norm": 0.17260710895061493, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0034, + "step": 24740 + }, + { + "epoch": 1.5147805863271926, + "grad_norm": 0.26109611988067627, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0044, + "step": 24750 + }, + { + "epoch": 1.5153926188873248, + "grad_norm": 0.22439827024936676, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0042, + "step": 24760 + }, + { + "epoch": 1.516004651447457, + "grad_norm": 0.2269357591867447, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0039, + "step": 24770 + }, + { + "epoch": 1.5166166840075892, + "grad_norm": 0.20416954159736633, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0039, + "step": 24780 + }, + { + "epoch": 1.5172287165677214, + "grad_norm": 0.1766926646232605, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0031, + "step": 24790 + }, + { + "epoch": 1.5178407491278536, + "grad_norm": 0.05759773403406143, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0029, + "step": 24800 + }, + { + "epoch": 1.5184527816879858, + "grad_norm": 0.19152496755123138, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0034, + "step": 24810 + }, + { + "epoch": 1.519064814248118, + "grad_norm": 0.09876703470945358, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.5196768468082502, + "grad_norm": 0.11626110225915909, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0037, + "step": 24830 + }, + { + "epoch": 1.5202888793683824, + "grad_norm": 0.13713783025741577, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0027, + "step": 24840 + }, + { + "epoch": 1.5209009119285146, + "grad_norm": 0.19144660234451294, + "learning_rate": 3.36521439484193e-06, + "loss": 0.004, + "step": 24850 + }, + { + "epoch": 1.5215129444886468, + "grad_norm": 0.1376778483390808, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0037, + "step": 24860 + }, + { + "epoch": 1.522124977048779, + "grad_norm": 0.4120432436466217, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0042, + "step": 24870 + }, + { + "epoch": 1.5227370096089112, + "grad_norm": 0.14245551824569702, + "learning_rate": 3.349767211300933e-06, + "loss": 0.003, + "step": 24880 + }, + { + "epoch": 1.5233490421690434, + "grad_norm": 0.19136923551559448, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0053, + "step": 24890 + }, + { + "epoch": 1.5239610747291756, + "grad_norm": 0.28412777185440063, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0031, + "step": 24900 + }, + { + "epoch": 1.5245731072893078, + "grad_norm": 0.18925072252750397, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.003, + "step": 24910 + }, + { + "epoch": 1.52518513984944, + "grad_norm": 0.21378494799137115, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0034, + "step": 24920 + }, + { + "epoch": 1.5257971724095722, + "grad_norm": 0.19160443544387817, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0037, + "step": 24930 + }, + { + "epoch": 1.5264092049697044, + "grad_norm": 0.19070027768611908, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.5270212375298367, + "grad_norm": 0.20489074289798737, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.003, + "step": 24950 + }, + { + "epoch": 1.5276332700899689, + "grad_norm": 0.15747228264808655, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0037, + "step": 24960 + }, + { + "epoch": 1.528245302650101, + "grad_norm": 0.21312901377677917, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0035, + "step": 24970 + }, + { + "epoch": 1.5288573352102333, + "grad_norm": 0.10329846292734146, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0033, + "step": 24980 + }, + { + "epoch": 1.5294693677703655, + "grad_norm": 0.13872355222702026, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0056, + "step": 24990 + }, + { + "epoch": 1.5300814003304977, + "grad_norm": 0.08532251417636871, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0026, + "step": 25000 + }, + { + "epoch": 1.5306934328906299, + "grad_norm": 0.1309783011674881, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0038, + "step": 25010 + }, + { + "epoch": 1.531305465450762, + "grad_norm": 0.16484731435775757, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0034, + "step": 25020 + }, + { + "epoch": 1.5319174980108943, + "grad_norm": 0.1756003201007843, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0044, + "step": 25030 + }, + { + "epoch": 1.5325295305710265, + "grad_norm": 0.13745243847370148, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0063, + "step": 25040 + }, + { + "epoch": 1.5331415631311587, + "grad_norm": 0.1077183336019516, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0032, + "step": 25050 + }, + { + "epoch": 1.5337535956912909, + "grad_norm": 0.3091605007648468, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0036, + "step": 25060 + }, + { + "epoch": 1.534365628251423, + "grad_norm": 0.13469856977462769, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0031, + "step": 25070 + }, + { + "epoch": 1.5349776608115553, + "grad_norm": 0.2445354014635086, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0037, + "step": 25080 + }, + { + "epoch": 1.5355896933716875, + "grad_norm": 0.1065889522433281, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0038, + "step": 25090 + }, + { + "epoch": 1.5362017259318197, + "grad_norm": 0.1539459079504013, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0036, + "step": 25100 + }, + { + "epoch": 1.536813758491952, + "grad_norm": 0.23242861032485962, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0037, + "step": 25110 + }, + { + "epoch": 1.537425791052084, + "grad_norm": 0.18660615384578705, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0034, + "step": 25120 + }, + { + "epoch": 1.5380378236122163, + "grad_norm": 0.14089861512184143, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0057, + "step": 25130 + }, + { + "epoch": 1.5386498561723485, + "grad_norm": 0.30568358302116394, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0036, + "step": 25140 + }, + { + "epoch": 1.5392618887324807, + "grad_norm": 0.0965384691953659, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0029, + "step": 25150 + }, + { + "epoch": 1.539873921292613, + "grad_norm": 0.12925416231155396, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0028, + "step": 25160 + }, + { + "epoch": 1.5404859538527451, + "grad_norm": 0.10820749402046204, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0025, + "step": 25170 + }, + { + "epoch": 1.5410979864128773, + "grad_norm": 0.200232595205307, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0038, + "step": 25180 + }, + { + "epoch": 1.5417100189730095, + "grad_norm": 0.13515910506248474, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0037, + "step": 25190 + }, + { + "epoch": 1.5423220515331417, + "grad_norm": 0.08493158221244812, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0026, + "step": 25200 + }, + { + "epoch": 1.542934084093274, + "grad_norm": 0.21674226224422455, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0047, + "step": 25210 + }, + { + "epoch": 1.543546116653406, + "grad_norm": 0.18259066343307495, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0029, + "step": 25220 + }, + { + "epoch": 1.5441581492135381, + "grad_norm": 0.14857260882854462, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0029, + "step": 25230 + }, + { + "epoch": 1.5447701817736703, + "grad_norm": 0.1540914922952652, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0026, + "step": 25240 + }, + { + "epoch": 1.5453822143338025, + "grad_norm": 0.08827090263366699, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0029, + "step": 25250 + }, + { + "epoch": 1.5459942468939347, + "grad_norm": 0.07511961460113525, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0035, + "step": 25260 + }, + { + "epoch": 1.546606279454067, + "grad_norm": 0.26209381222724915, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0033, + "step": 25270 + }, + { + "epoch": 1.5472183120141991, + "grad_norm": 0.08861620724201202, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0033, + "step": 25280 + }, + { + "epoch": 1.5478303445743313, + "grad_norm": 0.1642802655696869, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0032, + "step": 25290 + }, + { + "epoch": 1.5484423771344635, + "grad_norm": 0.24771225452423096, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0034, + "step": 25300 + }, + { + "epoch": 1.5490544096945957, + "grad_norm": 0.2717854976654053, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.549666442254728, + "grad_norm": 0.12177802622318268, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0029, + "step": 25320 + }, + { + "epoch": 1.5502784748148601, + "grad_norm": 0.09988416731357574, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0052, + "step": 25330 + }, + { + "epoch": 1.5508905073749923, + "grad_norm": 0.08877446502447128, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0024, + "step": 25340 + }, + { + "epoch": 1.5515025399351245, + "grad_norm": 0.16233091056346893, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.003, + "step": 25350 + }, + { + "epoch": 1.5521145724952568, + "grad_norm": 0.10167178511619568, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0034, + "step": 25360 + }, + { + "epoch": 1.552726605055389, + "grad_norm": 0.14738866686820984, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0036, + "step": 25370 + }, + { + "epoch": 1.5533386376155212, + "grad_norm": 0.07526370882987976, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0036, + "step": 25380 + }, + { + "epoch": 1.5539506701756534, + "grad_norm": 0.1659732311964035, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0032, + "step": 25390 + }, + { + "epoch": 1.5545627027357856, + "grad_norm": 0.18707287311553955, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0023, + "step": 25400 + }, + { + "epoch": 1.5551747352959178, + "grad_norm": 0.21416662633419037, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0029, + "step": 25410 + }, + { + "epoch": 1.55578676785605, + "grad_norm": 0.3034561574459076, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5563988004161822, + "grad_norm": 0.18879717588424683, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5570108329763144, + "grad_norm": 0.12917254865169525, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0038, + "step": 25440 + }, + { + "epoch": 1.5576228655364466, + "grad_norm": 0.0970548763871193, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0046, + "step": 25450 + }, + { + "epoch": 1.5582348980965788, + "grad_norm": 0.17424598336219788, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.558846930656711, + "grad_norm": 0.11429346352815628, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0039, + "step": 25470 + }, + { + "epoch": 1.5594589632168432, + "grad_norm": 0.19154596328735352, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0028, + "step": 25480 + }, + { + "epoch": 1.5600709957769754, + "grad_norm": 0.1475156843662262, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0028, + "step": 25490 + }, + { + "epoch": 1.5606830283371074, + "grad_norm": 0.29066604375839233, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0037, + "step": 25500 + }, + { + "epoch": 1.5612950608972396, + "grad_norm": 0.21379634737968445, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.004, + "step": 25510 + }, + { + "epoch": 1.5619070934573718, + "grad_norm": 0.1648091822862625, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.003, + "step": 25520 + }, + { + "epoch": 1.562519126017504, + "grad_norm": 0.2791198790073395, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0038, + "step": 25530 + }, + { + "epoch": 1.5631311585776362, + "grad_norm": 0.13038018345832825, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0028, + "step": 25540 + }, + { + "epoch": 1.5637431911377684, + "grad_norm": 0.07513634115457535, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0032, + "step": 25550 + }, + { + "epoch": 1.5643552236979006, + "grad_norm": 0.34259703755378723, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5649672562580328, + "grad_norm": 0.1602829545736313, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0036, + "step": 25570 + }, + { + "epoch": 1.565579288818165, + "grad_norm": 0.11303776502609253, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.003, + "step": 25580 + }, + { + "epoch": 1.5661913213782972, + "grad_norm": 0.06348636001348495, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0038, + "step": 25590 + }, + { + "epoch": 1.5668033539384294, + "grad_norm": 0.2563594579696655, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0027, + "step": 25600 + }, + { + "epoch": 1.5674153864985616, + "grad_norm": 0.08260748535394669, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0029, + "step": 25610 + }, + { + "epoch": 1.5680274190586938, + "grad_norm": 0.15986980497837067, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0044, + "step": 25620 + }, + { + "epoch": 1.568639451618826, + "grad_norm": 0.19412761926651, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.004, + "step": 25630 + }, + { + "epoch": 1.5692514841789582, + "grad_norm": 0.16794568300247192, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0042, + "step": 25640 + }, + { + "epoch": 1.5698635167390904, + "grad_norm": 0.34898805618286133, + "learning_rate": 2.981383959667165e-06, + "loss": 0.003, + "step": 25650 + }, + { + "epoch": 1.5704755492992226, + "grad_norm": 0.11825685203075409, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0037, + "step": 25660 + }, + { + "epoch": 1.5710875818593548, + "grad_norm": 0.1430155634880066, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0049, + "step": 25670 + }, + { + "epoch": 1.571699614419487, + "grad_norm": 0.13148540258407593, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0031, + "step": 25680 + }, + { + "epoch": 1.5723116469796192, + "grad_norm": 0.14384756982326508, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0036, + "step": 25690 + }, + { + "epoch": 1.5729236795397514, + "grad_norm": 0.11322541534900665, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0028, + "step": 25700 + }, + { + "epoch": 1.5735357120998836, + "grad_norm": 0.1428067833185196, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0033, + "step": 25710 + }, + { + "epoch": 1.5741477446600158, + "grad_norm": 0.1169947013258934, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0024, + "step": 25720 + }, + { + "epoch": 1.574759777220148, + "grad_norm": 0.33150142431259155, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0038, + "step": 25730 + }, + { + "epoch": 1.5753718097802802, + "grad_norm": 0.12486983090639114, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.003, + "step": 25740 + }, + { + "epoch": 1.5759838423404124, + "grad_norm": 0.12485318630933762, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0025, + "step": 25750 + }, + { + "epoch": 1.5765958749005446, + "grad_norm": 0.10158280283212662, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0032, + "step": 25760 + }, + { + "epoch": 1.5772079074606769, + "grad_norm": 0.13820113241672516, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0041, + "step": 25770 + }, + { + "epoch": 1.577819940020809, + "grad_norm": 0.18718287348747253, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0027, + "step": 25780 + }, + { + "epoch": 1.5784319725809413, + "grad_norm": 0.154324010014534, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.004, + "step": 25790 + }, + { + "epoch": 1.5790440051410735, + "grad_norm": 0.10862802714109421, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0039, + "step": 25800 + }, + { + "epoch": 1.5796560377012057, + "grad_norm": 0.11738114804029465, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0032, + "step": 25810 + }, + { + "epoch": 1.5802680702613379, + "grad_norm": 0.08674368262290955, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0035, + "step": 25820 + }, + { + "epoch": 1.58088010282147, + "grad_norm": 0.16917847096920013, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0032, + "step": 25830 + }, + { + "epoch": 1.5814921353816023, + "grad_norm": 0.10122957825660706, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0044, + "step": 25840 + }, + { + "epoch": 1.5821041679417345, + "grad_norm": 0.14450572431087494, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0031, + "step": 25850 + }, + { + "epoch": 1.5827162005018667, + "grad_norm": 0.11220426112413406, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0035, + "step": 25860 + }, + { + "epoch": 1.5833282330619989, + "grad_norm": 0.15793107450008392, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0034, + "step": 25870 + }, + { + "epoch": 1.583940265622131, + "grad_norm": 0.11485118418931961, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0049, + "step": 25880 + }, + { + "epoch": 1.5845522981822633, + "grad_norm": 0.11588255316019058, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0032, + "step": 25890 + }, + { + "epoch": 1.5851643307423955, + "grad_norm": 0.09770877659320831, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0033, + "step": 25900 + }, + { + "epoch": 1.5857763633025277, + "grad_norm": 0.4078996479511261, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0041, + "step": 25910 + }, + { + "epoch": 1.58638839586266, + "grad_norm": 0.16744333505630493, + "learning_rate": 2.865295218604555e-06, + "loss": 0.003, + "step": 25920 + }, + { + "epoch": 1.587000428422792, + "grad_norm": 0.10358662158250809, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0034, + "step": 25930 + }, + { + "epoch": 1.5876124609829243, + "grad_norm": 0.1420212686061859, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0024, + "step": 25940 + }, + { + "epoch": 1.5882244935430565, + "grad_norm": 0.1387208104133606, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0035, + "step": 25950 + }, + { + "epoch": 1.5888365261031887, + "grad_norm": 0.2383398711681366, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0044, + "step": 25960 + }, + { + "epoch": 1.589448558663321, + "grad_norm": 0.1263049691915512, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0029, + "step": 25970 + }, + { + "epoch": 1.5900605912234531, + "grad_norm": 0.10938797891139984, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0029, + "step": 25980 + }, + { + "epoch": 1.5906726237835853, + "grad_norm": 0.18173988163471222, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0033, + "step": 25990 + }, + { + "epoch": 1.5912846563437175, + "grad_norm": 0.20956522226333618, + "learning_rate": 2.832230653119002e-06, + "loss": 0.003, + "step": 26000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.624061151019008e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/training_args.bin b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd9e28a44ae85140e2ef027a82e8be4c39167cc4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5644791eb57bcb4c4808b4c2429b71e4c49eece4fc60f263f4553a3380f230bb +size 6097 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/generation_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7e73b631f8b2e4b681ae079d800d1805986a0c4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59953db9f62d3b4aa922f3af4236e97005504315bd6496f77809092bdbc22fb3 +size 4921072616 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0e79e956daf80aa40db4b64972bb0ed0d3c67958 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4769ec5a2230aa709ec1bb9e62b69ba59491e9488f00889604915c0bb9c8791c +size 4978830984 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..11d2aee91baa2dfbac7e88862ccf3bfccb37c401 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fc45a5ff77dd6fb9410ca729dd4c9ac6e57bffe8d9854573b97de25dcbde236 +size 4100977896 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..f33de4b80f47e0bac1a414431a8354d8345d60c5 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.65332532291412, + -30.64622355117798, + -14.452480476760865, + -1.8581012797355654, + -2.2742317820549007, + -1.9569469915390014, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 3.0011677881240857, + 22.348905650329584, + 21.68580058555603, + 2.3937565994262693, + 4.117288079452516, + 3.295379007720948, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.570000648498535, + -1.0618462562561035, + 3.623035430908203, + 0.010442602448165417, + 0.7240540385246277, + 0.44398337602615356, + 0.12898989021778107, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.04909086227417, + 17.099597930908203, + 8.363018989562988, + 0.6997263431549072, + 1.1358375549316406, + 0.9687971472740173, + 0.9916459321975708, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.777750787353515, + -21.249025872802733, + -2.4021557040214536, + -4.092200187206268, + -3.2986312219619753, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.645499613952634, + 30.59561934127808, + 14.405443457031247, + 1.8499586300849913, + 2.268683268356323, + 1.963451420021057, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.6817545890808105, + 1.3444018363952637, + -3.5411791801452637, + -0.009792014956474304, + -0.7230188846588135, + -0.44849714636802673, + 0.15749873220920563, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.988739013671875, + 16.884004592895508, + 8.242538452148438, + 0.6991510391235352, + 1.1302146911621094, + 0.9690405130386353, + 0.9875192046165466, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a37d0a68fa2cea01a634c81049aeaf8115709bd --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json @@ -0,0 +1,19634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7136911683701572, + "eval_steps": 500, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006120325601321991, + "grad_norm": 2.2432243824005127, + "learning_rate": 1.8e-07, + "loss": 0.1384, + "step": 10 + }, + { + "epoch": 0.0012240651202643981, + "grad_norm": 1.959119439125061, + "learning_rate": 3.8e-07, + "loss": 0.1388, + "step": 20 + }, + { + "epoch": 0.001836097680396597, + "grad_norm": 1.8843899965286255, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1307, + "step": 30 + }, + { + "epoch": 0.0024481302405287963, + "grad_norm": 1.7569042444229126, + "learning_rate": 7.8e-07, + "loss": 0.1238, + "step": 40 + }, + { + "epoch": 0.0030601628006609954, + "grad_norm": 2.6189017295837402, + "learning_rate": 9.800000000000001e-07, + "loss": 0.1275, + "step": 50 + }, + { + "epoch": 0.003672195360793194, + "grad_norm": 1.8418694734573364, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.1032, + "step": 60 + }, + { + "epoch": 0.004284227920925393, + "grad_norm": 1.481676697731018, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0816, + "step": 70 + }, + { + "epoch": 0.004896260481057593, + "grad_norm": 0.9590038061141968, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0732, + "step": 80 + }, + { + "epoch": 0.005508293041189791, + "grad_norm": 1.002897024154663, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0608, + "step": 90 + }, + { + "epoch": 0.006120325601321991, + "grad_norm": 0.9830108284950256, + "learning_rate": 1.98e-06, + "loss": 0.042, + "step": 100 + }, + { + "epoch": 0.006732358161454189, + "grad_norm": 0.858244001865387, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0314, + "step": 110 + }, + { + "epoch": 0.007344390721586388, + "grad_norm": 0.5761063694953918, + "learning_rate": 2.38e-06, + "loss": 0.029, + "step": 120 + }, + { + "epoch": 0.007956423281718587, + "grad_norm": 0.5434514284133911, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0227, + "step": 130 + }, + { + "epoch": 0.008568455841850786, + "grad_norm": 0.6488766670227051, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0202, + "step": 140 + }, + { + "epoch": 0.009180488401982986, + "grad_norm": 0.36763015389442444, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0157, + "step": 150 + }, + { + "epoch": 0.009792520962115185, + "grad_norm": 0.49271446466445923, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.010404553522247383, + "grad_norm": 0.23608209192752838, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0122, + "step": 170 + }, + { + "epoch": 0.011016586082379582, + "grad_norm": 0.47871828079223633, + "learning_rate": 3.58e-06, + "loss": 0.0131, + "step": 180 + }, + { + "epoch": 0.011628618642511782, + "grad_norm": 0.6862446069717407, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0131, + "step": 190 + }, + { + "epoch": 0.012240651202643981, + "grad_norm": 0.7964349389076233, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 0.01285268376277618, + "grad_norm": 0.5564846396446228, + "learning_rate": 4.18e-06, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 0.013464716322908379, + "grad_norm": 0.2810452878475189, + "learning_rate": 4.38e-06, + "loss": 0.0128, + "step": 220 + }, + { + "epoch": 0.014076748883040578, + "grad_norm": 0.4474979341030121, + "learning_rate": 4.58e-06, + "loss": 0.0188, + "step": 230 + }, + { + "epoch": 0.014688781443172776, + "grad_norm": 0.47965875267982483, + "learning_rate": 4.78e-06, + "loss": 0.0141, + "step": 240 + }, + { + "epoch": 0.015300814003304975, + "grad_norm": 0.3410812020301819, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0085, + "step": 250 + }, + { + "epoch": 0.015912846563437173, + "grad_norm": 0.39907002449035645, + "learning_rate": 5.18e-06, + "loss": 0.0106, + "step": 260 + }, + { + "epoch": 0.016524879123569373, + "grad_norm": 0.28909367322921753, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0103, + "step": 270 + }, + { + "epoch": 0.017136911683701572, + "grad_norm": 0.31524109840393066, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0101, + "step": 280 + }, + { + "epoch": 0.017748944243833772, + "grad_norm": 0.29430100321769714, + "learning_rate": 5.78e-06, + "loss": 0.0109, + "step": 290 + }, + { + "epoch": 0.01836097680396597, + "grad_norm": 0.2709169387817383, + "learning_rate": 5.98e-06, + "loss": 0.0102, + "step": 300 + }, + { + "epoch": 0.01897300936409817, + "grad_norm": 0.33067119121551514, + "learning_rate": 6.18e-06, + "loss": 0.0095, + "step": 310 + }, + { + "epoch": 0.01958504192423037, + "grad_norm": 0.28110620379447937, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0102, + "step": 320 + }, + { + "epoch": 0.02019707448436257, + "grad_norm": 0.27736902236938477, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0088, + "step": 330 + }, + { + "epoch": 0.020809107044494766, + "grad_norm": 0.3238557279109955, + "learning_rate": 6.780000000000001e-06, + "loss": 0.01, + "step": 340 + }, + { + "epoch": 0.021421139604626965, + "grad_norm": 0.30263441801071167, + "learning_rate": 6.98e-06, + "loss": 0.0095, + "step": 350 + }, + { + "epoch": 0.022033172164759165, + "grad_norm": 0.2618265450000763, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0096, + "step": 360 + }, + { + "epoch": 0.022645204724891364, + "grad_norm": 0.272565633058548, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0093, + "step": 370 + }, + { + "epoch": 0.023257237285023564, + "grad_norm": 0.44272440671920776, + "learning_rate": 7.58e-06, + "loss": 0.0087, + "step": 380 + }, + { + "epoch": 0.023869269845155763, + "grad_norm": 0.27631404995918274, + "learning_rate": 7.78e-06, + "loss": 0.0093, + "step": 390 + }, + { + "epoch": 0.024481302405287963, + "grad_norm": 0.4108494520187378, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0093, + "step": 400 + }, + { + "epoch": 0.02509333496542016, + "grad_norm": 0.43498387932777405, + "learning_rate": 8.18e-06, + "loss": 0.0098, + "step": 410 + }, + { + "epoch": 0.02570536752555236, + "grad_norm": 0.3419845700263977, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0091, + "step": 420 + }, + { + "epoch": 0.026317400085684558, + "grad_norm": 0.5677013993263245, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0104, + "step": 430 + }, + { + "epoch": 0.026929432645816757, + "grad_norm": 0.24424298107624054, + "learning_rate": 8.78e-06, + "loss": 0.0089, + "step": 440 + }, + { + "epoch": 0.027541465205948957, + "grad_norm": 0.267781138420105, + "learning_rate": 8.98e-06, + "loss": 0.0107, + "step": 450 + }, + { + "epoch": 0.028153497766081156, + "grad_norm": 0.38459253311157227, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0081, + "step": 460 + }, + { + "epoch": 0.028765530326213356, + "grad_norm": 0.2647954523563385, + "learning_rate": 9.38e-06, + "loss": 0.0082, + "step": 470 + }, + { + "epoch": 0.029377562886345552, + "grad_norm": 0.44312018156051636, + "learning_rate": 9.58e-06, + "loss": 0.0102, + "step": 480 + }, + { + "epoch": 0.02998959544647775, + "grad_norm": 0.2309781014919281, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0118, + "step": 490 + }, + { + "epoch": 0.03060162800660995, + "grad_norm": 0.41755014657974243, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0094, + "step": 500 + }, + { + "epoch": 0.03121366056674215, + "grad_norm": 0.38537120819091797, + "learning_rate": 1.018e-05, + "loss": 0.011, + "step": 510 + }, + { + "epoch": 0.031825693126874346, + "grad_norm": 0.49801477789878845, + "learning_rate": 1.038e-05, + "loss": 0.0093, + "step": 520 + }, + { + "epoch": 0.03243772568700655, + "grad_norm": 0.3854966163635254, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0116, + "step": 530 + }, + { + "epoch": 0.033049758247138745, + "grad_norm": 0.3163810968399048, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.008, + "step": 540 + }, + { + "epoch": 0.03366179080727095, + "grad_norm": 0.33000636100769043, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0093, + "step": 550 + }, + { + "epoch": 0.034273823367403145, + "grad_norm": 0.3350297808647156, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0083, + "step": 560 + }, + { + "epoch": 0.03488585592753535, + "grad_norm": 0.18780949711799622, + "learning_rate": 1.138e-05, + "loss": 0.0097, + "step": 570 + }, + { + "epoch": 0.035497888487667544, + "grad_norm": 0.20399607717990875, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0092, + "step": 580 + }, + { + "epoch": 0.03610992104779974, + "grad_norm": 0.15931005775928497, + "learning_rate": 1.178e-05, + "loss": 0.0076, + "step": 590 + }, + { + "epoch": 0.03672195360793194, + "grad_norm": 0.20751547813415527, + "learning_rate": 1.198e-05, + "loss": 0.0079, + "step": 600 + }, + { + "epoch": 0.03733398616806414, + "grad_norm": 0.39666953682899475, + "learning_rate": 1.218e-05, + "loss": 0.0072, + "step": 610 + }, + { + "epoch": 0.03794601872819634, + "grad_norm": 0.385407030582428, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0089, + "step": 620 + }, + { + "epoch": 0.03855805128832854, + "grad_norm": 0.5228332877159119, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0092, + "step": 630 + }, + { + "epoch": 0.03917008384846074, + "grad_norm": 0.29315415024757385, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0098, + "step": 640 + }, + { + "epoch": 0.03978211640859294, + "grad_norm": 0.4300646483898163, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0079, + "step": 650 + }, + { + "epoch": 0.04039414896872514, + "grad_norm": 0.38021156191825867, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0103, + "step": 660 + }, + { + "epoch": 0.041006181528857336, + "grad_norm": 0.43489688634872437, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0105, + "step": 670 + }, + { + "epoch": 0.04161821408898953, + "grad_norm": 0.48019328713417053, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0124, + "step": 680 + }, + { + "epoch": 0.042230246649121735, + "grad_norm": 0.28486984968185425, + "learning_rate": 1.378e-05, + "loss": 0.0122, + "step": 690 + }, + { + "epoch": 0.04284227920925393, + "grad_norm": 0.35172080993652344, + "learning_rate": 1.398e-05, + "loss": 0.0093, + "step": 700 + }, + { + "epoch": 0.043454311769386134, + "grad_norm": 0.32531124353408813, + "learning_rate": 1.418e-05, + "loss": 0.0116, + "step": 710 + }, + { + "epoch": 0.04406634432951833, + "grad_norm": 0.388637512922287, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0077, + "step": 720 + }, + { + "epoch": 0.04467837688965053, + "grad_norm": 0.3816429078578949, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0123, + "step": 730 + }, + { + "epoch": 0.04529040944978273, + "grad_norm": 0.22786036133766174, + "learning_rate": 1.478e-05, + "loss": 0.0089, + "step": 740 + }, + { + "epoch": 0.045902442009914925, + "grad_norm": 0.2965328097343445, + "learning_rate": 1.498e-05, + "loss": 0.011, + "step": 750 + }, + { + "epoch": 0.04651447457004713, + "grad_norm": 0.3568362593650818, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0101, + "step": 760 + }, + { + "epoch": 0.047126507130179324, + "grad_norm": 0.2972166836261749, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0093, + "step": 770 + }, + { + "epoch": 0.04773853969031153, + "grad_norm": 0.4221388101577759, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.012, + "step": 780 + }, + { + "epoch": 0.04835057225044372, + "grad_norm": 0.37255391478538513, + "learning_rate": 1.578e-05, + "loss": 0.0085, + "step": 790 + }, + { + "epoch": 0.048962604810575926, + "grad_norm": 0.36007094383239746, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.008, + "step": 800 + }, + { + "epoch": 0.04957463737070812, + "grad_norm": 0.40588808059692383, + "learning_rate": 1.618e-05, + "loss": 0.0081, + "step": 810 + }, + { + "epoch": 0.05018666993084032, + "grad_norm": 0.46563687920570374, + "learning_rate": 1.638e-05, + "loss": 0.0076, + "step": 820 + }, + { + "epoch": 0.05079870249097252, + "grad_norm": 0.3161381483078003, + "learning_rate": 1.658e-05, + "loss": 0.0129, + "step": 830 + }, + { + "epoch": 0.05141073505110472, + "grad_norm": 0.3800298869609833, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0146, + "step": 840 + }, + { + "epoch": 0.05202276761123692, + "grad_norm": 0.36572107672691345, + "learning_rate": 1.698e-05, + "loss": 0.0148, + "step": 850 + }, + { + "epoch": 0.052634800171369116, + "grad_norm": 0.4084141254425049, + "learning_rate": 1.718e-05, + "loss": 0.0085, + "step": 860 + }, + { + "epoch": 0.05324683273150132, + "grad_norm": 0.2906867265701294, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0116, + "step": 870 + }, + { + "epoch": 0.053858865291633515, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.758e-05, + "loss": 0.0076, + "step": 880 + }, + { + "epoch": 0.05447089785176571, + "grad_norm": 0.5292996764183044, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0094, + "step": 890 + }, + { + "epoch": 0.055082930411897914, + "grad_norm": 0.23192685842514038, + "learning_rate": 1.798e-05, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 0.05569496297203011, + "grad_norm": 0.41050270199775696, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0099, + "step": 910 + }, + { + "epoch": 0.05630699553216231, + "grad_norm": 0.3336002230644226, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.05691902809229451, + "grad_norm": 0.46233776211738586, + "learning_rate": 1.858e-05, + "loss": 0.0104, + "step": 930 + }, + { + "epoch": 0.05753106065242671, + "grad_norm": 0.36776405572891235, + "learning_rate": 1.878e-05, + "loss": 0.0115, + "step": 940 + }, + { + "epoch": 0.05814309321255891, + "grad_norm": 0.47848618030548096, + "learning_rate": 1.898e-05, + "loss": 0.0108, + "step": 950 + }, + { + "epoch": 0.058755125772691104, + "grad_norm": 0.35507604479789734, + "learning_rate": 1.918e-05, + "loss": 0.0095, + "step": 960 + }, + { + "epoch": 0.05936715833282331, + "grad_norm": 0.4613397717475891, + "learning_rate": 1.938e-05, + "loss": 0.0119, + "step": 970 + }, + { + "epoch": 0.0599791908929555, + "grad_norm": 0.34492260217666626, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0109, + "step": 980 + }, + { + "epoch": 0.060591223453087706, + "grad_norm": 0.34624582529067993, + "learning_rate": 1.978e-05, + "loss": 0.0099, + "step": 990 + }, + { + "epoch": 0.0612032560132199, + "grad_norm": 0.9161475896835327, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0109, + "step": 1000 + }, + { + "epoch": 0.061815288573352105, + "grad_norm": 0.367807537317276, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0087, + "step": 1010 + }, + { + "epoch": 0.0624273211334843, + "grad_norm": 0.4043216407299042, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.0084, + "step": 1020 + }, + { + "epoch": 0.0630393536936165, + "grad_norm": 0.315305233001709, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0075, + "step": 1030 + }, + { + "epoch": 0.06365138625374869, + "grad_norm": 0.49702969193458557, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0103, + "step": 1040 + }, + { + "epoch": 0.0642634188138809, + "grad_norm": 0.46286216378211975, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0116, + "step": 1050 + }, + { + "epoch": 0.0648754513740131, + "grad_norm": 0.332142174243927, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0103, + "step": 1060 + }, + { + "epoch": 0.0654874839341453, + "grad_norm": 0.6118510961532593, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0118, + "step": 1070 + }, + { + "epoch": 0.06609951649427749, + "grad_norm": 0.49074795842170715, + "learning_rate": 1.999967041472886e-05, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.0667115490544097, + "grad_norm": 0.42575374245643616, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0125, + "step": 1090 + }, + { + "epoch": 0.0673235816145419, + "grad_norm": 0.3223794996738434, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0091, + "step": 1100 + }, + { + "epoch": 0.06793561417467409, + "grad_norm": 0.4952760636806488, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.011, + "step": 1110 + }, + { + "epoch": 0.06854764673480629, + "grad_norm": 0.36144813895225525, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0096, + "step": 1120 + }, + { + "epoch": 0.06915967929493849, + "grad_norm": 0.31190025806427, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0115, + "step": 1130 + }, + { + "epoch": 0.0697717118550707, + "grad_norm": 0.7014928460121155, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.07038374441520288, + "grad_norm": 0.4382205605506897, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0111, + "step": 1150 + }, + { + "epoch": 0.07099577697533509, + "grad_norm": 0.3750714659690857, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0116, + "step": 1160 + }, + { + "epoch": 0.07160780953546729, + "grad_norm": 0.4174371361732483, + "learning_rate": 1.999849173538598e-05, + "loss": 0.009, + "step": 1170 + }, + { + "epoch": 0.07221984209559948, + "grad_norm": 0.44394591450691223, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.07283187465573168, + "grad_norm": 0.43412888050079346, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0098, + "step": 1190 + }, + { + "epoch": 0.07344390721586389, + "grad_norm": 0.6421196460723877, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.01, + "step": 1200 + }, + { + "epoch": 0.07405593977599609, + "grad_norm": 0.6313903331756592, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0137, + "step": 1210 + }, + { + "epoch": 0.07466797233612828, + "grad_norm": 0.49340254068374634, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0104, + "step": 1220 + }, + { + "epoch": 0.07528000489626048, + "grad_norm": 0.40420663356781006, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0126, + "step": 1230 + }, + { + "epoch": 0.07589203745639268, + "grad_norm": 0.3955318033695221, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.013, + "step": 1240 + }, + { + "epoch": 0.07650407001652489, + "grad_norm": 0.4967520236968994, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0098, + "step": 1250 + }, + { + "epoch": 0.07711610257665708, + "grad_norm": 0.3380029499530792, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0084, + "step": 1260 + }, + { + "epoch": 0.07772813513678928, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.009, + "step": 1270 + }, + { + "epoch": 0.07834016769692148, + "grad_norm": 0.4533286392688751, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0093, + "step": 1280 + }, + { + "epoch": 0.07895220025705367, + "grad_norm": 0.39559242129325867, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0106, + "step": 1290 + }, + { + "epoch": 0.07956423281718587, + "grad_norm": 0.23190362751483917, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.01, + "step": 1300 + }, + { + "epoch": 0.08017626537731808, + "grad_norm": 0.4732286334037781, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.08078829793745028, + "grad_norm": 0.3010174036026001, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.08140033049758247, + "grad_norm": 0.3989834189414978, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0097, + "step": 1330 + }, + { + "epoch": 0.08201236305771467, + "grad_norm": 0.4597114622592926, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.01, + "step": 1340 + }, + { + "epoch": 0.08262439561784687, + "grad_norm": 0.426826536655426, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.011, + "step": 1350 + }, + { + "epoch": 0.08323642817797906, + "grad_norm": 0.4876341223716736, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0105, + "step": 1360 + }, + { + "epoch": 0.08384846073811127, + "grad_norm": 0.5444457530975342, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0099, + "step": 1370 + }, + { + "epoch": 0.08446049329824347, + "grad_norm": 0.5096126794815063, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.007, + "step": 1380 + }, + { + "epoch": 0.08507252585837567, + "grad_norm": 0.43828368186950684, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.08568455841850786, + "grad_norm": 0.40163955092430115, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0103, + "step": 1400 + }, + { + "epoch": 0.08629659097864006, + "grad_norm": 0.3110432028770447, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0115, + "step": 1410 + }, + { + "epoch": 0.08690862353877227, + "grad_norm": 0.8393893241882324, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.012, + "step": 1420 + }, + { + "epoch": 0.08752065609890446, + "grad_norm": 0.2751714289188385, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0093, + "step": 1430 + }, + { + "epoch": 0.08813268865903666, + "grad_norm": 0.36969971656799316, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0112, + "step": 1440 + }, + { + "epoch": 0.08874472121916886, + "grad_norm": 0.3721938729286194, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08935675377930107, + "grad_norm": 0.26564934849739075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0107, + "step": 1460 + }, + { + "epoch": 0.08996878633943325, + "grad_norm": 0.36552169919013977, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0123, + "step": 1470 + }, + { + "epoch": 0.09058081889956546, + "grad_norm": 0.23664990067481995, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0074, + "step": 1480 + }, + { + "epoch": 0.09119285145969766, + "grad_norm": 0.49903133511543274, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.09180488401982985, + "grad_norm": 0.43505051732063293, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 0.09241691657996205, + "grad_norm": 0.20318932831287384, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0088, + "step": 1510 + }, + { + "epoch": 0.09302894914009426, + "grad_norm": 0.3289708197116852, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.008, + "step": 1520 + }, + { + "epoch": 0.09364098170022646, + "grad_norm": 0.3920934200286865, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0145, + "step": 1530 + }, + { + "epoch": 0.09425301426035865, + "grad_norm": 0.40396374464035034, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.09486504682049085, + "grad_norm": 0.4044182300567627, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.012, + "step": 1550 + }, + { + "epoch": 0.09547707938062305, + "grad_norm": 0.2318611741065979, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0115, + "step": 1560 + }, + { + "epoch": 0.09608911194075524, + "grad_norm": 0.3905714750289917, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.008, + "step": 1570 + }, + { + "epoch": 0.09670114450088745, + "grad_norm": 0.2516922652721405, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0084, + "step": 1580 + }, + { + "epoch": 0.09731317706101965, + "grad_norm": 0.338455468416214, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0122, + "step": 1590 + }, + { + "epoch": 0.09792520962115185, + "grad_norm": 0.31875041127204895, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0081, + "step": 1600 + }, + { + "epoch": 0.09853724218128404, + "grad_norm": 0.2996121644973755, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0068, + "step": 1610 + }, + { + "epoch": 0.09914927474141624, + "grad_norm": 0.4381162226200104, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0103, + "step": 1620 + }, + { + "epoch": 0.09976130730154845, + "grad_norm": 0.5531038045883179, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0168, + "step": 1630 + }, + { + "epoch": 0.10037333986168064, + "grad_norm": 1.1283385753631592, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0119, + "step": 1640 + }, + { + "epoch": 0.10098537242181284, + "grad_norm": 0.38017332553863525, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0096, + "step": 1650 + }, + { + "epoch": 0.10159740498194504, + "grad_norm": 0.4669477045536041, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0111, + "step": 1660 + }, + { + "epoch": 0.10220943754207724, + "grad_norm": 0.3903254270553589, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0098, + "step": 1670 + }, + { + "epoch": 0.10282147010220943, + "grad_norm": 0.49671587347984314, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0083, + "step": 1680 + }, + { + "epoch": 0.10343350266234164, + "grad_norm": 0.36555853486061096, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0088, + "step": 1690 + }, + { + "epoch": 0.10404553522247384, + "grad_norm": 0.21804726123809814, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0086, + "step": 1700 + }, + { + "epoch": 0.10465756778260603, + "grad_norm": 0.6744784116744995, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0109, + "step": 1710 + }, + { + "epoch": 0.10526960034273823, + "grad_norm": 0.34379470348358154, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0096, + "step": 1720 + }, + { + "epoch": 0.10588163290287043, + "grad_norm": 0.27760598063468933, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 0.10649366546300264, + "grad_norm": 0.36294442415237427, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10710569802313483, + "grad_norm": 0.42200908064842224, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.011, + "step": 1750 + }, + { + "epoch": 0.10771773058326703, + "grad_norm": 0.47863906621932983, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0107, + "step": 1760 + }, + { + "epoch": 0.10832976314339923, + "grad_norm": 0.32717248797416687, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0112, + "step": 1770 + }, + { + "epoch": 0.10894179570353142, + "grad_norm": 0.4255545735359192, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0106, + "step": 1780 + }, + { + "epoch": 0.10955382826366362, + "grad_norm": 0.5034983158111572, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0098, + "step": 1790 + }, + { + "epoch": 0.11016586082379583, + "grad_norm": 0.37071412801742554, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0099, + "step": 1800 + }, + { + "epoch": 0.11077789338392803, + "grad_norm": 0.23624737560749054, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0069, + "step": 1810 + }, + { + "epoch": 0.11138992594406022, + "grad_norm": 0.5815485715866089, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0095, + "step": 1820 + }, + { + "epoch": 0.11200195850419242, + "grad_norm": 1.1828722953796387, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0104, + "step": 1830 + }, + { + "epoch": 0.11261399106432463, + "grad_norm": 0.38099589943885803, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0106, + "step": 1840 + }, + { + "epoch": 0.11322602362445681, + "grad_norm": 0.38476184010505676, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0112, + "step": 1850 + }, + { + "epoch": 0.11383805618458902, + "grad_norm": 0.48982104659080505, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0125, + "step": 1860 + }, + { + "epoch": 0.11445008874472122, + "grad_norm": 0.4165821671485901, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0099, + "step": 1870 + }, + { + "epoch": 0.11506212130485342, + "grad_norm": 0.3412662446498871, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0061, + "step": 1880 + }, + { + "epoch": 0.11567415386498561, + "grad_norm": 0.46617937088012695, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0129, + "step": 1890 + }, + { + "epoch": 0.11628618642511782, + "grad_norm": 0.2705824077129364, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0082, + "step": 1900 + }, + { + "epoch": 0.11689821898525002, + "grad_norm": 0.3567829430103302, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0125, + "step": 1910 + }, + { + "epoch": 0.11751025154538221, + "grad_norm": 0.4438138008117676, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0129, + "step": 1920 + }, + { + "epoch": 0.11812228410551441, + "grad_norm": 0.356703519821167, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0083, + "step": 1930 + }, + { + "epoch": 0.11873431666564661, + "grad_norm": 0.6039804220199585, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0086, + "step": 1940 + }, + { + "epoch": 0.11934634922577882, + "grad_norm": 0.4572801887989044, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0067, + "step": 1950 + }, + { + "epoch": 0.119958381785911, + "grad_norm": 0.5063445568084717, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0091, + "step": 1960 + }, + { + "epoch": 0.12057041434604321, + "grad_norm": 0.3467857837677002, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.008, + "step": 1970 + }, + { + "epoch": 0.12118244690617541, + "grad_norm": 0.4875742197036743, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0102, + "step": 1980 + }, + { + "epoch": 0.1217944794663076, + "grad_norm": 0.3209119141101837, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0081, + "step": 1990 + }, + { + "epoch": 0.1224065120264398, + "grad_norm": 0.4731980860233307, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0118, + "step": 2000 + }, + { + "epoch": 0.123018544586572, + "grad_norm": 0.5742963552474976, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0125, + "step": 2010 + }, + { + "epoch": 0.12363057714670421, + "grad_norm": 0.41357406973838806, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.1242426097068364, + "grad_norm": 0.6277521252632141, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0096, + "step": 2030 + }, + { + "epoch": 0.1248546422669686, + "grad_norm": 0.41252902150154114, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0108, + "step": 2040 + }, + { + "epoch": 0.1254666748271008, + "grad_norm": 0.782122790813446, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0134, + "step": 2050 + }, + { + "epoch": 0.126078707387233, + "grad_norm": 0.45011264085769653, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0102, + "step": 2060 + }, + { + "epoch": 0.1266907399473652, + "grad_norm": 0.2724951207637787, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0088, + "step": 2070 + }, + { + "epoch": 0.12730277250749739, + "grad_norm": 0.2351481169462204, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.009, + "step": 2080 + }, + { + "epoch": 0.1279148050676296, + "grad_norm": 0.34568479657173157, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0092, + "step": 2090 + }, + { + "epoch": 0.1285268376277618, + "grad_norm": 0.44493499398231506, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0087, + "step": 2100 + }, + { + "epoch": 0.129138870187894, + "grad_norm": 0.3011283874511719, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0108, + "step": 2110 + }, + { + "epoch": 0.1297509027480262, + "grad_norm": 0.4170232117176056, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.1303629353081584, + "grad_norm": 0.2696056365966797, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0093, + "step": 2130 + }, + { + "epoch": 0.1309749678682906, + "grad_norm": 0.4092336893081665, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0083, + "step": 2140 + }, + { + "epoch": 0.13158700042842278, + "grad_norm": 0.36637401580810547, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.01, + "step": 2150 + }, + { + "epoch": 0.13219903298855498, + "grad_norm": 0.28675684332847595, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0079, + "step": 2160 + }, + { + "epoch": 0.13281106554868718, + "grad_norm": 0.27699902653694153, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0071, + "step": 2170 + }, + { + "epoch": 0.1334230981088194, + "grad_norm": 0.3832298517227173, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0104, + "step": 2180 + }, + { + "epoch": 0.1340351306689516, + "grad_norm": 0.3590598702430725, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 0.1346471632290838, + "grad_norm": 0.21830014884471893, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0093, + "step": 2200 + }, + { + "epoch": 0.135259195789216, + "grad_norm": 0.342492938041687, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0109, + "step": 2210 + }, + { + "epoch": 0.13587122834934817, + "grad_norm": 0.6337023973464966, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0082, + "step": 2220 + }, + { + "epoch": 0.13648326090948038, + "grad_norm": 0.41742798686027527, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13709529346961258, + "grad_norm": 0.3180190324783325, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 0.13770732602974478, + "grad_norm": 0.36720144748687744, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0114, + "step": 2250 + }, + { + "epoch": 0.13831935858987698, + "grad_norm": 0.29457366466522217, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0077, + "step": 2260 + }, + { + "epoch": 0.1389313911500092, + "grad_norm": 0.24702222645282745, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0074, + "step": 2270 + }, + { + "epoch": 0.1395434237101414, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0079, + "step": 2280 + }, + { + "epoch": 0.14015545627027357, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0127, + "step": 2290 + }, + { + "epoch": 0.14076748883040577, + "grad_norm": 0.44338247179985046, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0077, + "step": 2300 + }, + { + "epoch": 0.14137952139053797, + "grad_norm": 0.31765618920326233, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0091, + "step": 2310 + }, + { + "epoch": 0.14199155395067017, + "grad_norm": 0.322534441947937, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0068, + "step": 2320 + }, + { + "epoch": 0.14260358651080238, + "grad_norm": 0.23571068048477173, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0094, + "step": 2330 + }, + { + "epoch": 0.14321561907093458, + "grad_norm": 0.26818808913230896, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0092, + "step": 2340 + }, + { + "epoch": 0.14382765163106678, + "grad_norm": 0.31886982917785645, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0084, + "step": 2350 + }, + { + "epoch": 0.14443968419119896, + "grad_norm": 0.5176070928573608, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0104, + "step": 2360 + }, + { + "epoch": 0.14505171675133116, + "grad_norm": 0.4322161078453064, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0081, + "step": 2370 + }, + { + "epoch": 0.14566374931146336, + "grad_norm": 0.4076510965824127, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0102, + "step": 2380 + }, + { + "epoch": 0.14627578187159557, + "grad_norm": 0.3808838725090027, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0096, + "step": 2390 + }, + { + "epoch": 0.14688781443172777, + "grad_norm": 0.5045232176780701, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0109, + "step": 2400 + }, + { + "epoch": 0.14749984699185997, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0077, + "step": 2410 + }, + { + "epoch": 0.14811187955199218, + "grad_norm": 0.28561875224113464, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0141, + "step": 2420 + }, + { + "epoch": 0.14872391211212435, + "grad_norm": 0.414410799741745, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0094, + "step": 2430 + }, + { + "epoch": 0.14933594467225655, + "grad_norm": 0.4587285816669464, + "learning_rate": 1.989086647373215e-05, + "loss": 0.009, + "step": 2440 + }, + { + "epoch": 0.14994797723238876, + "grad_norm": 0.7567377686500549, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0084, + "step": 2450 + }, + { + "epoch": 0.15056000979252096, + "grad_norm": 0.4980221390724182, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0087, + "step": 2460 + }, + { + "epoch": 0.15117204235265316, + "grad_norm": 0.41810303926467896, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0082, + "step": 2470 + }, + { + "epoch": 0.15178407491278537, + "grad_norm": 0.4193445146083832, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 0.15239610747291757, + "grad_norm": 0.2561246156692505, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0074, + "step": 2490 + }, + { + "epoch": 0.15300814003304977, + "grad_norm": 0.22316500544548035, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0069, + "step": 2500 + }, + { + "epoch": 0.15362017259318195, + "grad_norm": 0.31504112482070923, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0097, + "step": 2510 + }, + { + "epoch": 0.15423220515331415, + "grad_norm": 0.2944568991661072, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0101, + "step": 2520 + }, + { + "epoch": 0.15484423771344635, + "grad_norm": 0.2744649052619934, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0074, + "step": 2530 + }, + { + "epoch": 0.15545627027357856, + "grad_norm": 0.2717166841030121, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.007, + "step": 2540 + }, + { + "epoch": 0.15606830283371076, + "grad_norm": 0.32652929425239563, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.15668033539384296, + "grad_norm": 0.3169964849948883, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0089, + "step": 2560 + }, + { + "epoch": 0.15729236795397517, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0083, + "step": 2570 + }, + { + "epoch": 0.15790440051410734, + "grad_norm": 0.3869011700153351, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0088, + "step": 2580 + }, + { + "epoch": 0.15851643307423954, + "grad_norm": 0.2944110333919525, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0085, + "step": 2590 + }, + { + "epoch": 0.15912846563437175, + "grad_norm": 0.27993839979171753, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0097, + "step": 2600 + }, + { + "epoch": 0.15974049819450395, + "grad_norm": 0.42018845677375793, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0104, + "step": 2610 + }, + { + "epoch": 0.16035253075463615, + "grad_norm": 0.45006832480430603, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0072, + "step": 2620 + }, + { + "epoch": 0.16096456331476836, + "grad_norm": 0.275564581155777, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0072, + "step": 2630 + }, + { + "epoch": 0.16157659587490056, + "grad_norm": 0.503052294254303, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0091, + "step": 2640 + }, + { + "epoch": 0.16218862843503273, + "grad_norm": 0.33740976452827454, + "learning_rate": 1.985678043265668e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.16280066099516494, + "grad_norm": 0.5379078984260559, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0097, + "step": 2660 + }, + { + "epoch": 0.16341269355529714, + "grad_norm": 0.3605813980102539, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0109, + "step": 2670 + }, + { + "epoch": 0.16402472611542934, + "grad_norm": 0.49490585923194885, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 0.16463675867556155, + "grad_norm": 0.29894375801086426, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0089, + "step": 2690 + }, + { + "epoch": 0.16524879123569375, + "grad_norm": 0.395270437002182, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 0.16586082379582595, + "grad_norm": 0.25507843494415283, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0078, + "step": 2710 + }, + { + "epoch": 0.16647285635595813, + "grad_norm": 0.3304852843284607, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0178, + "step": 2720 + }, + { + "epoch": 0.16708488891609033, + "grad_norm": 0.4356633126735687, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0082, + "step": 2730 + }, + { + "epoch": 0.16769692147622253, + "grad_norm": 0.4104527533054352, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 0.16830895403635474, + "grad_norm": 0.25723493099212646, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0097, + "step": 2750 + }, + { + "epoch": 0.16892098659648694, + "grad_norm": 0.3280608057975769, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0079, + "step": 2760 + }, + { + "epoch": 0.16953301915661914, + "grad_norm": 0.4641128480434418, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0081, + "step": 2770 + }, + { + "epoch": 0.17014505171675134, + "grad_norm": 0.2704941928386688, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0112, + "step": 2780 + }, + { + "epoch": 0.17075708427688352, + "grad_norm": 0.42343780398368835, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0084, + "step": 2790 + }, + { + "epoch": 0.17136911683701572, + "grad_norm": 0.2606532573699951, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0085, + "step": 2800 + }, + { + "epoch": 0.17198114939714793, + "grad_norm": 0.39099374413490295, + "learning_rate": 1.982773261916081e-05, + "loss": 0.014, + "step": 2810 + }, + { + "epoch": 0.17259318195728013, + "grad_norm": 0.32653889060020447, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0095, + "step": 2820 + }, + { + "epoch": 0.17320521451741233, + "grad_norm": 0.34765321016311646, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17381724707754453, + "grad_norm": 0.2844177186489105, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.011, + "step": 2840 + }, + { + "epoch": 0.17442927963767674, + "grad_norm": 0.5079899430274963, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0114, + "step": 2850 + }, + { + "epoch": 0.1750413121978089, + "grad_norm": 0.4043678045272827, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0073, + "step": 2860 + }, + { + "epoch": 0.17565334475794112, + "grad_norm": 0.3833003640174866, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0076, + "step": 2870 + }, + { + "epoch": 0.17626537731807332, + "grad_norm": 0.2826341986656189, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0072, + "step": 2880 + }, + { + "epoch": 0.17687740987820552, + "grad_norm": 0.6043460965156555, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0084, + "step": 2890 + }, + { + "epoch": 0.17748944243833772, + "grad_norm": 0.3238481879234314, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.17810147499846993, + "grad_norm": 0.45817995071411133, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0103, + "step": 2910 + }, + { + "epoch": 0.17871350755860213, + "grad_norm": 0.21048744022846222, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0094, + "step": 2920 + }, + { + "epoch": 0.1793255401187343, + "grad_norm": 0.3401891887187958, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0058, + "step": 2930 + }, + { + "epoch": 0.1799375726788665, + "grad_norm": 0.3655509948730469, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0071, + "step": 2940 + }, + { + "epoch": 0.1805496052389987, + "grad_norm": 0.47406241297721863, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.18116163779913091, + "grad_norm": 0.3278841972351074, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0121, + "step": 2960 + }, + { + "epoch": 0.18177367035926312, + "grad_norm": 0.271436482667923, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.008, + "step": 2970 + }, + { + "epoch": 0.18238570291939532, + "grad_norm": 0.41475561261177063, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.008, + "step": 2980 + }, + { + "epoch": 0.18299773547952752, + "grad_norm": 0.5389090776443481, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0091, + "step": 2990 + }, + { + "epoch": 0.1836097680396597, + "grad_norm": 0.3958609700202942, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0106, + "step": 3000 + }, + { + "epoch": 0.1842218005997919, + "grad_norm": 0.3456019461154938, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0084, + "step": 3010 + }, + { + "epoch": 0.1848338331599241, + "grad_norm": 0.2959386706352234, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0071, + "step": 3020 + }, + { + "epoch": 0.1854458657200563, + "grad_norm": 0.2617223858833313, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0082, + "step": 3030 + }, + { + "epoch": 0.1860578982801885, + "grad_norm": 0.45173966884613037, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0105, + "step": 3040 + }, + { + "epoch": 0.1866699308403207, + "grad_norm": 0.4127421975135803, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.008, + "step": 3050 + }, + { + "epoch": 0.18728196340045292, + "grad_norm": 0.3142230808734894, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0085, + "step": 3060 + }, + { + "epoch": 0.1878939959605851, + "grad_norm": 0.49720287322998047, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0089, + "step": 3070 + }, + { + "epoch": 0.1885060285207173, + "grad_norm": 0.6417365074157715, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0088, + "step": 3080 + }, + { + "epoch": 0.1891180610808495, + "grad_norm": 0.44801583886146545, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0098, + "step": 3090 + }, + { + "epoch": 0.1897300936409817, + "grad_norm": 0.3606127202510834, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0087, + "step": 3100 + }, + { + "epoch": 0.1903421262011139, + "grad_norm": 0.268971711397171, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0092, + "step": 3110 + }, + { + "epoch": 0.1909541587612461, + "grad_norm": 0.2367011308670044, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0074, + "step": 3120 + }, + { + "epoch": 0.1915661913213783, + "grad_norm": 0.41643625497817993, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0062, + "step": 3130 + }, + { + "epoch": 0.19217822388151048, + "grad_norm": 0.33202284574508667, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0081, + "step": 3140 + }, + { + "epoch": 0.1927902564416427, + "grad_norm": 0.279813289642334, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0074, + "step": 3150 + }, + { + "epoch": 0.1934022890017749, + "grad_norm": 0.5127174258232117, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0101, + "step": 3160 + }, + { + "epoch": 0.1940143215619071, + "grad_norm": 0.36921849846839905, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.1946263541220393, + "grad_norm": 0.3509728014469147, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0114, + "step": 3180 + }, + { + "epoch": 0.1952383866821715, + "grad_norm": 0.3088139295578003, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0089, + "step": 3190 + }, + { + "epoch": 0.1958504192423037, + "grad_norm": 0.43653762340545654, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0116, + "step": 3200 + }, + { + "epoch": 0.19646245180243588, + "grad_norm": 0.2522308826446533, + "learning_rate": 1.974353140804231e-05, + "loss": 0.007, + "step": 3210 + }, + { + "epoch": 0.19707448436256808, + "grad_norm": 0.37519100308418274, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 0.19768651692270028, + "grad_norm": 0.379027783870697, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0086, + "step": 3230 + }, + { + "epoch": 0.1982985494828325, + "grad_norm": 0.2713090479373932, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0073, + "step": 3240 + }, + { + "epoch": 0.1989105820429647, + "grad_norm": 0.41106846928596497, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0188, + "step": 3250 + }, + { + "epoch": 0.1995226146030969, + "grad_norm": 0.3914758861064911, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 0.2001346471632291, + "grad_norm": 0.4763018488883972, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0082, + "step": 3270 + }, + { + "epoch": 0.20074667972336127, + "grad_norm": 0.23002664744853973, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0085, + "step": 3280 + }, + { + "epoch": 0.20135871228349347, + "grad_norm": 0.2887377142906189, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0074, + "step": 3290 + }, + { + "epoch": 0.20197074484362568, + "grad_norm": 0.2322079837322235, + "learning_rate": 1.972231769371516e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 0.20258277740375788, + "grad_norm": 0.39307233691215515, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0095, + "step": 3310 + }, + { + "epoch": 0.20319480996389008, + "grad_norm": 0.5209783315658569, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.012, + "step": 3320 + }, + { + "epoch": 0.20380684252402229, + "grad_norm": 0.45187172293663025, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.2044188750841545, + "grad_norm": 0.480970174074173, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0072, + "step": 3340 + }, + { + "epoch": 0.20503090764428666, + "grad_norm": 0.30979010462760925, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0091, + "step": 3350 + }, + { + "epoch": 0.20564294020441887, + "grad_norm": 0.6410729289054871, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0096, + "step": 3360 + }, + { + "epoch": 0.20625497276455107, + "grad_norm": 0.23707512021064758, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0078, + "step": 3370 + }, + { + "epoch": 0.20686700532468327, + "grad_norm": 0.3029544949531555, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0115, + "step": 3380 + }, + { + "epoch": 0.20747903788481548, + "grad_norm": 0.28677740693092346, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0068, + "step": 3390 + }, + { + "epoch": 0.20809107044494768, + "grad_norm": 0.2433662712574005, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0082, + "step": 3400 + }, + { + "epoch": 0.20870310300507988, + "grad_norm": 0.38066667318344116, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0089, + "step": 3410 + }, + { + "epoch": 0.20931513556521206, + "grad_norm": 0.3830282390117645, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0073, + "step": 3420 + }, + { + "epoch": 0.20992716812534426, + "grad_norm": 0.359684556722641, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0088, + "step": 3430 + }, + { + "epoch": 0.21053920068547646, + "grad_norm": 0.3497346341609955, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0082, + "step": 3440 + }, + { + "epoch": 0.21115123324560867, + "grad_norm": 0.3664748966693878, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.21176326580574087, + "grad_norm": 0.382804811000824, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0077, + "step": 3460 + }, + { + "epoch": 0.21237529836587307, + "grad_norm": 0.22746194899082184, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0107, + "step": 3470 + }, + { + "epoch": 0.21298733092600527, + "grad_norm": 0.4094266891479492, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0096, + "step": 3480 + }, + { + "epoch": 0.21359936348613745, + "grad_norm": 0.26990365982055664, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0089, + "step": 3490 + }, + { + "epoch": 0.21421139604626965, + "grad_norm": 0.2602371275424957, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0074, + "step": 3500 + }, + { + "epoch": 0.21482342860640186, + "grad_norm": 0.34200435876846313, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21543546116653406, + "grad_norm": 0.4260508716106415, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.21604749372666626, + "grad_norm": 0.4017483592033386, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0103, + "step": 3530 + }, + { + "epoch": 0.21665952628679847, + "grad_norm": 0.40005844831466675, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0094, + "step": 3540 + }, + { + "epoch": 0.21727155884693067, + "grad_norm": 0.3856841027736664, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0095, + "step": 3550 + }, + { + "epoch": 0.21788359140706284, + "grad_norm": 0.3245168626308441, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0067, + "step": 3560 + }, + { + "epoch": 0.21849562396719505, + "grad_norm": 0.2698485255241394, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0079, + "step": 3570 + }, + { + "epoch": 0.21910765652732725, + "grad_norm": 0.24520452320575714, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0109, + "step": 3580 + }, + { + "epoch": 0.21971968908745945, + "grad_norm": 0.397175133228302, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0114, + "step": 3590 + }, + { + "epoch": 0.22033172164759166, + "grad_norm": 0.40339091420173645, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.22094375420772386, + "grad_norm": 0.404435396194458, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0087, + "step": 3610 + }, + { + "epoch": 0.22155578676785606, + "grad_norm": 0.3300188183784485, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0073, + "step": 3620 + }, + { + "epoch": 0.22216781932798824, + "grad_norm": 0.23486892879009247, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0087, + "step": 3630 + }, + { + "epoch": 0.22277985188812044, + "grad_norm": 0.37211188673973083, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.22339188444825264, + "grad_norm": 0.32422709465026855, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.009, + "step": 3650 + }, + { + "epoch": 0.22400391700838485, + "grad_norm": 0.43535664677619934, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0099, + "step": 3660 + }, + { + "epoch": 0.22461594956851705, + "grad_norm": 0.3295724093914032, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.22522798212864925, + "grad_norm": 0.2840734124183655, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0082, + "step": 3680 + }, + { + "epoch": 0.22584001468878145, + "grad_norm": 0.2861844599246979, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0079, + "step": 3690 + }, + { + "epoch": 0.22645204724891363, + "grad_norm": 0.3194407820701599, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0071, + "step": 3700 + }, + { + "epoch": 0.22706407980904583, + "grad_norm": 0.38770729303359985, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0076, + "step": 3710 + }, + { + "epoch": 0.22767611236917804, + "grad_norm": 0.4637960195541382, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0093, + "step": 3720 + }, + { + "epoch": 0.22828814492931024, + "grad_norm": 0.31972312927246094, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0078, + "step": 3730 + }, + { + "epoch": 0.22890017748944244, + "grad_norm": 0.5273001790046692, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0076, + "step": 3740 + }, + { + "epoch": 0.22951221004957464, + "grad_norm": 0.30589622259140015, + "learning_rate": 1.960385541132679e-05, + "loss": 0.009, + "step": 3750 + }, + { + "epoch": 0.23012424260970685, + "grad_norm": 0.31634265184402466, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0063, + "step": 3760 + }, + { + "epoch": 0.23073627516983902, + "grad_norm": 0.32762402296066284, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0081, + "step": 3770 + }, + { + "epoch": 0.23134830772997123, + "grad_norm": 0.42696496844291687, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 0.23196034029010343, + "grad_norm": 0.4676671624183655, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0093, + "step": 3790 + }, + { + "epoch": 0.23257237285023563, + "grad_norm": 0.3347911536693573, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 0.23318440541036783, + "grad_norm": 0.3083193600177765, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0072, + "step": 3810 + }, + { + "epoch": 0.23379643797050004, + "grad_norm": 0.38178423047065735, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0083, + "step": 3820 + }, + { + "epoch": 0.23440847053063224, + "grad_norm": 0.2796846330165863, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0072, + "step": 3830 + }, + { + "epoch": 0.23502050309076442, + "grad_norm": 0.37444883584976196, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.23563253565089662, + "grad_norm": 0.3286772668361664, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23624456821102882, + "grad_norm": 0.45423513650894165, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.23685660077116102, + "grad_norm": 0.36881721019744873, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0068, + "step": 3870 + }, + { + "epoch": 0.23746863333129323, + "grad_norm": 0.3560579717159271, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0084, + "step": 3880 + }, + { + "epoch": 0.23808066589142543, + "grad_norm": 0.43887296319007874, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0076, + "step": 3890 + }, + { + "epoch": 0.23869269845155763, + "grad_norm": 0.3080165982246399, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0069, + "step": 3900 + }, + { + "epoch": 0.2393047310116898, + "grad_norm": 0.2327195703983307, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.239916763571822, + "grad_norm": 0.5960802435874939, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0088, + "step": 3920 + }, + { + "epoch": 0.24052879613195421, + "grad_norm": 0.36213600635528564, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0076, + "step": 3930 + }, + { + "epoch": 0.24114082869208642, + "grad_norm": 0.2950032949447632, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0115, + "step": 3940 + }, + { + "epoch": 0.24175286125221862, + "grad_norm": 0.4527084529399872, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0089, + "step": 3950 + }, + { + "epoch": 0.24236489381235082, + "grad_norm": 0.4422491192817688, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0135, + "step": 3960 + }, + { + "epoch": 0.24297692637248303, + "grad_norm": 0.45049232244491577, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 0.2435889589326152, + "grad_norm": 0.2566494941711426, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0095, + "step": 3980 + }, + { + "epoch": 0.2442009914927474, + "grad_norm": 0.49880343675613403, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0099, + "step": 3990 + }, + { + "epoch": 0.2448130240528796, + "grad_norm": 0.4699341952800751, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0084, + "step": 4000 + }, + { + "epoch": 0.2454250566130118, + "grad_norm": 0.41230708360671997, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 0.246037089173144, + "grad_norm": 0.4836854934692383, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.01, + "step": 4020 + }, + { + "epoch": 0.24664912173327622, + "grad_norm": 0.3056115508079529, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0082, + "step": 4030 + }, + { + "epoch": 0.24726115429340842, + "grad_norm": 0.151325523853302, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0062, + "step": 4040 + }, + { + "epoch": 0.2478731868535406, + "grad_norm": 0.3798811137676239, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0081, + "step": 4050 + }, + { + "epoch": 0.2484852194136728, + "grad_norm": 0.3308229148387909, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 0.249097251973805, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0074, + "step": 4070 + }, + { + "epoch": 0.2497092845339372, + "grad_norm": 0.24179549515247345, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 0.2503213170940694, + "grad_norm": 0.20879383385181427, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 0.2509333496542016, + "grad_norm": 0.39275774359703064, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.2515453822143338, + "grad_norm": 0.2925782799720764, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0095, + "step": 4110 + }, + { + "epoch": 0.252157414774466, + "grad_norm": 0.6465128660202026, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0102, + "step": 4120 + }, + { + "epoch": 0.2527694473345982, + "grad_norm": 0.34663915634155273, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.007, + "step": 4130 + }, + { + "epoch": 0.2533814798947304, + "grad_norm": 0.3387165367603302, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0091, + "step": 4140 + }, + { + "epoch": 0.2539935124548626, + "grad_norm": 0.32989630103111267, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0084, + "step": 4150 + }, + { + "epoch": 0.25460554501499477, + "grad_norm": 0.22870391607284546, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0071, + "step": 4160 + }, + { + "epoch": 0.255217577575127, + "grad_norm": 0.3866496682167053, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0079, + "step": 4170 + }, + { + "epoch": 0.2558296101352592, + "grad_norm": 0.29885268211364746, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0068, + "step": 4180 + }, + { + "epoch": 0.2564416426953914, + "grad_norm": 0.4693736135959625, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 0.2570536752555236, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0074, + "step": 4200 + }, + { + "epoch": 0.2576657078156558, + "grad_norm": 0.21141012012958527, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 0.258277740375788, + "grad_norm": 0.2284570336341858, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0102, + "step": 4220 + }, + { + "epoch": 0.2588897729359202, + "grad_norm": 0.4675048887729645, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0112, + "step": 4230 + }, + { + "epoch": 0.2595018054960524, + "grad_norm": 0.3906441628932953, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0088, + "step": 4240 + }, + { + "epoch": 0.2601138380561846, + "grad_norm": 0.22990387678146362, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 0.2607258706163168, + "grad_norm": 0.41871073842048645, + "learning_rate": 1.944490251296856e-05, + "loss": 0.009, + "step": 4260 + }, + { + "epoch": 0.261337903176449, + "grad_norm": 0.2724440395832062, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0074, + "step": 4270 + }, + { + "epoch": 0.2619499357365812, + "grad_norm": 0.42590636014938354, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0069, + "step": 4280 + }, + { + "epoch": 0.2625619682967134, + "grad_norm": 0.3604855239391327, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0068, + "step": 4290 + }, + { + "epoch": 0.26317400085684556, + "grad_norm": 0.475304514169693, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0082, + "step": 4300 + }, + { + "epoch": 0.26378603341697776, + "grad_norm": 0.24752479791641235, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0065, + "step": 4310 + }, + { + "epoch": 0.26439806597710996, + "grad_norm": 0.4384835958480835, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0104, + "step": 4320 + }, + { + "epoch": 0.26501009853724217, + "grad_norm": 0.24999107420444489, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0076, + "step": 4330 + }, + { + "epoch": 0.26562213109737437, + "grad_norm": 0.292491614818573, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 0.2662341636575066, + "grad_norm": 0.2380208522081375, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0069, + "step": 4350 + }, + { + "epoch": 0.2668461962176388, + "grad_norm": 0.2906023859977722, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0063, + "step": 4360 + }, + { + "epoch": 0.267458228777771, + "grad_norm": 0.4718990623950958, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0074, + "step": 4370 + }, + { + "epoch": 0.2680702613379032, + "grad_norm": 0.33257269859313965, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0073, + "step": 4380 + }, + { + "epoch": 0.2686822938980354, + "grad_norm": 0.34411463141441345, + "learning_rate": 1.940024231916886e-05, + "loss": 0.006, + "step": 4390 + }, + { + "epoch": 0.2692943264581676, + "grad_norm": 0.40312516689300537, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0093, + "step": 4400 + }, + { + "epoch": 0.2699063590182998, + "grad_norm": 0.2248350828886032, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0082, + "step": 4410 + }, + { + "epoch": 0.270518391578432, + "grad_norm": 0.30094820261001587, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0092, + "step": 4420 + }, + { + "epoch": 0.2711304241385642, + "grad_norm": 0.4277440309524536, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0099, + "step": 4430 + }, + { + "epoch": 0.27174245669869634, + "grad_norm": 0.2876254916191101, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0078, + "step": 4440 + }, + { + "epoch": 0.27235448925882855, + "grad_norm": 0.3453986346721649, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0096, + "step": 4450 + }, + { + "epoch": 0.27296652181896075, + "grad_norm": 0.31379634141921997, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0088, + "step": 4460 + }, + { + "epoch": 0.27357855437909295, + "grad_norm": 0.294477254152298, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0073, + "step": 4470 + }, + { + "epoch": 0.27419058693922516, + "grad_norm": 0.3773270845413208, + "learning_rate": 1.936834723687526e-05, + "loss": 0.008, + "step": 4480 + }, + { + "epoch": 0.27480261949935736, + "grad_norm": 0.31942978501319885, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0086, + "step": 4490 + }, + { + "epoch": 0.27541465205948956, + "grad_norm": 0.46827632188796997, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27602668461962176, + "grad_norm": 0.2735249102115631, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0107, + "step": 4510 + }, + { + "epoch": 0.27663871717975397, + "grad_norm": 0.30048197507858276, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0082, + "step": 4520 + }, + { + "epoch": 0.27725074973988617, + "grad_norm": 0.3507469594478607, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0093, + "step": 4530 + }, + { + "epoch": 0.2778627823000184, + "grad_norm": 0.5642989277839661, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0091, + "step": 4540 + }, + { + "epoch": 0.2784748148601506, + "grad_norm": 0.2769993245601654, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0105, + "step": 4550 + }, + { + "epoch": 0.2790868474202828, + "grad_norm": 0.30269622802734375, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0066, + "step": 4560 + }, + { + "epoch": 0.279698879980415, + "grad_norm": 0.3717023432254791, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0114, + "step": 4570 + }, + { + "epoch": 0.28031091254054713, + "grad_norm": 0.5065163373947144, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0075, + "step": 4580 + }, + { + "epoch": 0.28092294510067933, + "grad_norm": 0.4302189350128174, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0077, + "step": 4590 + }, + { + "epoch": 0.28153497766081154, + "grad_norm": 0.44008374214172363, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0086, + "step": 4600 + }, + { + "epoch": 0.28214701022094374, + "grad_norm": 0.4647364318370819, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0073, + "step": 4610 + }, + { + "epoch": 0.28275904278107594, + "grad_norm": 0.4229913651943207, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0077, + "step": 4620 + }, + { + "epoch": 0.28337107534120815, + "grad_norm": 0.36600178480148315, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.28398310790134035, + "grad_norm": 0.47143280506134033, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0085, + "step": 4640 + }, + { + "epoch": 0.28459514046147255, + "grad_norm": 0.29140496253967285, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0056, + "step": 4650 + }, + { + "epoch": 0.28520717302160475, + "grad_norm": 0.3964666426181793, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0071, + "step": 4660 + }, + { + "epoch": 0.28581920558173696, + "grad_norm": 0.407536119222641, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0095, + "step": 4670 + }, + { + "epoch": 0.28643123814186916, + "grad_norm": 0.33687031269073486, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0091, + "step": 4680 + }, + { + "epoch": 0.28704327070200136, + "grad_norm": 0.3182448446750641, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0087, + "step": 4690 + }, + { + "epoch": 0.28765530326213357, + "grad_norm": 0.40998023748397827, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0099, + "step": 4700 + }, + { + "epoch": 0.28826733582226577, + "grad_norm": 0.28750360012054443, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0087, + "step": 4710 + }, + { + "epoch": 0.2888793683823979, + "grad_norm": 0.36494627594947815, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 0.2894914009425301, + "grad_norm": 0.37047910690307617, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0094, + "step": 4730 + }, + { + "epoch": 0.2901034335026623, + "grad_norm": 0.2577553987503052, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.2907154660627945, + "grad_norm": 0.24589397013187408, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0067, + "step": 4750 + }, + { + "epoch": 0.29132749862292673, + "grad_norm": 0.37927499413490295, + "learning_rate": 1.926404507646751e-05, + "loss": 0.008, + "step": 4760 + }, + { + "epoch": 0.29193953118305893, + "grad_norm": 0.40547946095466614, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0101, + "step": 4770 + }, + { + "epoch": 0.29255156374319113, + "grad_norm": 0.47896578907966614, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0114, + "step": 4780 + }, + { + "epoch": 0.29316359630332334, + "grad_norm": 0.42911696434020996, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0066, + "step": 4790 + }, + { + "epoch": 0.29377562886345554, + "grad_norm": 0.21735505759716034, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0072, + "step": 4800 + }, + { + "epoch": 0.29438766142358774, + "grad_norm": 0.25916650891304016, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.29499969398371995, + "grad_norm": 0.23863966763019562, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0086, + "step": 4820 + }, + { + "epoch": 0.29561172654385215, + "grad_norm": 0.41552650928497314, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0078, + "step": 4830 + }, + { + "epoch": 0.29622375910398435, + "grad_norm": 0.2775874733924866, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0082, + "step": 4840 + }, + { + "epoch": 0.29683579166411656, + "grad_norm": 0.28962916135787964, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0088, + "step": 4850 + }, + { + "epoch": 0.2974478242242487, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0076, + "step": 4860 + }, + { + "epoch": 0.2980598567843809, + "grad_norm": 0.3833489716053009, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0096, + "step": 4870 + }, + { + "epoch": 0.2986718893445131, + "grad_norm": 0.20357537269592285, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.2992839219046453, + "grad_norm": 0.4648539423942566, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0099, + "step": 4890 + }, + { + "epoch": 0.2998959544647775, + "grad_norm": 0.2701941728591919, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0062, + "step": 4900 + }, + { + "epoch": 0.3005079870249097, + "grad_norm": 0.31277161836624146, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0069, + "step": 4910 + }, + { + "epoch": 0.3011200195850419, + "grad_norm": 0.27697697281837463, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0094, + "step": 4920 + }, + { + "epoch": 0.3017320521451741, + "grad_norm": 0.22880606353282928, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0074, + "step": 4930 + }, + { + "epoch": 0.3023440847053063, + "grad_norm": 0.258404940366745, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0078, + "step": 4940 + }, + { + "epoch": 0.30295611726543853, + "grad_norm": 0.394394189119339, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0109, + "step": 4950 + }, + { + "epoch": 0.30356814982557073, + "grad_norm": 0.24108687043190002, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0082, + "step": 4960 + }, + { + "epoch": 0.30418018238570294, + "grad_norm": 0.34520867466926575, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0098, + "step": 4970 + }, + { + "epoch": 0.30479221494583514, + "grad_norm": 0.33723267912864685, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0104, + "step": 4980 + }, + { + "epoch": 0.30540424750596734, + "grad_norm": 0.28276878595352173, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0072, + "step": 4990 + }, + { + "epoch": 0.30601628006609954, + "grad_norm": 0.32236188650131226, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.012, + "step": 5000 + }, + { + "epoch": 0.3066283126262317, + "grad_norm": 0.20596888661384583, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0073, + "step": 5010 + }, + { + "epoch": 0.3072403451863639, + "grad_norm": 0.37921255826950073, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0073, + "step": 5020 + }, + { + "epoch": 0.3078523777464961, + "grad_norm": 0.30738911032676697, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0083, + "step": 5030 + }, + { + "epoch": 0.3084644103066283, + "grad_norm": 0.1938163936138153, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.3090764428667605, + "grad_norm": 0.25826898217201233, + "learning_rate": 1.914800406458133e-05, + "loss": 0.008, + "step": 5050 + }, + { + "epoch": 0.3096884754268927, + "grad_norm": 0.18951697647571564, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0058, + "step": 5060 + }, + { + "epoch": 0.3103005079870249, + "grad_norm": 0.3877381980419159, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3109125405471571, + "grad_norm": 0.3133573830127716, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0088, + "step": 5080 + }, + { + "epoch": 0.3115245731072893, + "grad_norm": 0.33131852746009827, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0062, + "step": 5090 + }, + { + "epoch": 0.3121366056674215, + "grad_norm": 0.21276263892650604, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0071, + "step": 5100 + }, + { + "epoch": 0.3127486382275537, + "grad_norm": 0.46878281235694885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0084, + "step": 5110 + }, + { + "epoch": 0.3133606707876859, + "grad_norm": 0.44227683544158936, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0097, + "step": 5120 + }, + { + "epoch": 0.3139727033478181, + "grad_norm": 0.41950204968452454, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.31458473590795033, + "grad_norm": 0.4214445948600769, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0081, + "step": 5140 + }, + { + "epoch": 0.3151967684680825, + "grad_norm": 0.3779868483543396, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0075, + "step": 5150 + }, + { + "epoch": 0.3158088010282147, + "grad_norm": 0.4587777853012085, + "learning_rate": 1.910187855634501e-05, + "loss": 0.009, + "step": 5160 + }, + { + "epoch": 0.3164208335883469, + "grad_norm": 0.4875587224960327, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0099, + "step": 5170 + }, + { + "epoch": 0.3170328661484791, + "grad_norm": 0.22378237545490265, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0071, + "step": 5180 + }, + { + "epoch": 0.3176448987086113, + "grad_norm": 0.3360678553581238, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0101, + "step": 5190 + }, + { + "epoch": 0.3182569312687435, + "grad_norm": 0.36370640993118286, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0068, + "step": 5200 + }, + { + "epoch": 0.3188689638288757, + "grad_norm": 0.25814393162727356, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0076, + "step": 5210 + }, + { + "epoch": 0.3194809963890079, + "grad_norm": 0.39010074734687805, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0066, + "step": 5220 + }, + { + "epoch": 0.3200930289491401, + "grad_norm": 0.44009074568748474, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0078, + "step": 5230 + }, + { + "epoch": 0.3207050615092723, + "grad_norm": 0.45733046531677246, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0074, + "step": 5240 + }, + { + "epoch": 0.3213170940694045, + "grad_norm": 0.4555135667324066, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0089, + "step": 5250 + }, + { + "epoch": 0.3219291266295367, + "grad_norm": 0.5864276885986328, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0083, + "step": 5260 + }, + { + "epoch": 0.3225411591896689, + "grad_norm": 0.3305470943450928, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0094, + "step": 5270 + }, + { + "epoch": 0.3231531917498011, + "grad_norm": 0.21458053588867188, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0091, + "step": 5280 + }, + { + "epoch": 0.32376522430993326, + "grad_norm": 0.2927384376525879, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.007, + "step": 5290 + }, + { + "epoch": 0.32437725687006547, + "grad_norm": 0.387608140707016, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0068, + "step": 5300 + }, + { + "epoch": 0.32498928943019767, + "grad_norm": 0.28193122148513794, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0065, + "step": 5310 + }, + { + "epoch": 0.3256013219903299, + "grad_norm": 0.33098119497299194, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0082, + "step": 5320 + }, + { + "epoch": 0.3262133545504621, + "grad_norm": 0.5442482233047485, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0124, + "step": 5330 + }, + { + "epoch": 0.3268253871105943, + "grad_norm": 0.503669798374176, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0093, + "step": 5340 + }, + { + "epoch": 0.3274374196707265, + "grad_norm": 0.2307574301958084, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0071, + "step": 5350 + }, + { + "epoch": 0.3280494522308587, + "grad_norm": 0.3543917238712311, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.007, + "step": 5360 + }, + { + "epoch": 0.3286614847909909, + "grad_norm": 0.21763169765472412, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0059, + "step": 5370 + }, + { + "epoch": 0.3292735173511231, + "grad_norm": 0.38023391366004944, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 0.3298855499112553, + "grad_norm": 0.44597327709198, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0069, + "step": 5390 + }, + { + "epoch": 0.3304975824713875, + "grad_norm": 0.2994389533996582, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0083, + "step": 5400 + }, + { + "epoch": 0.3311096150315197, + "grad_norm": 0.26668304204940796, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0071, + "step": 5410 + }, + { + "epoch": 0.3317216475916519, + "grad_norm": 0.25944197177886963, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0065, + "step": 5420 + }, + { + "epoch": 0.33233368015178405, + "grad_norm": 0.3646431267261505, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0065, + "step": 5430 + }, + { + "epoch": 0.33294571271191625, + "grad_norm": 0.34860959649086, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0098, + "step": 5440 + }, + { + "epoch": 0.33355774527204846, + "grad_norm": 0.33718568086624146, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0069, + "step": 5450 + }, + { + "epoch": 0.33416977783218066, + "grad_norm": 0.2417302280664444, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0064, + "step": 5460 + }, + { + "epoch": 0.33478181039231286, + "grad_norm": 0.26607826352119446, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0048, + "step": 5470 + }, + { + "epoch": 0.33539384295244506, + "grad_norm": 0.31762364506721497, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0064, + "step": 5480 + }, + { + "epoch": 0.33600587551257727, + "grad_norm": 0.21427015960216522, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0079, + "step": 5490 + }, + { + "epoch": 0.33661790807270947, + "grad_norm": 0.3372637629508972, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.3372299406328417, + "grad_norm": 0.3760700821876526, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0066, + "step": 5510 + }, + { + "epoch": 0.3378419731929739, + "grad_norm": 0.22838029265403748, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0061, + "step": 5520 + }, + { + "epoch": 0.3384540057531061, + "grad_norm": 0.3105243444442749, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0089, + "step": 5530 + }, + { + "epoch": 0.3390660383132383, + "grad_norm": 0.23694929480552673, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0086, + "step": 5540 + }, + { + "epoch": 0.3396780708733705, + "grad_norm": 0.22935174405574799, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.3402901034335027, + "grad_norm": 0.26384714245796204, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 0.34090213599363484, + "grad_norm": 0.33245643973350525, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0061, + "step": 5570 + }, + { + "epoch": 0.34151416855376704, + "grad_norm": 0.3904813230037689, + "learning_rate": 1.891523933768891e-05, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.34212620111389924, + "grad_norm": 0.33858415484428406, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0067, + "step": 5590 + }, + { + "epoch": 0.34273823367403145, + "grad_norm": 0.3197486996650696, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0082, + "step": 5600 + }, + { + "epoch": 0.34335026623416365, + "grad_norm": 0.23814789950847626, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0068, + "step": 5610 + }, + { + "epoch": 0.34396229879429585, + "grad_norm": 0.3820457458496094, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0082, + "step": 5620 + }, + { + "epoch": 0.34457433135442805, + "grad_norm": 0.27518680691719055, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0063, + "step": 5630 + }, + { + "epoch": 0.34518636391456026, + "grad_norm": 0.24741721153259277, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 0.34579839647469246, + "grad_norm": 0.5140052437782288, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0178, + "step": 5650 + }, + { + "epoch": 0.34641042903482466, + "grad_norm": 0.5363543033599854, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0097, + "step": 5660 + }, + { + "epoch": 0.34702246159495687, + "grad_norm": 0.41116055846214294, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0078, + "step": 5670 + }, + { + "epoch": 0.34763449415508907, + "grad_norm": 0.412762314081192, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0086, + "step": 5680 + }, + { + "epoch": 0.34824652671522127, + "grad_norm": 0.399527907371521, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0088, + "step": 5690 + }, + { + "epoch": 0.3488585592753535, + "grad_norm": 0.3447834551334381, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0084, + "step": 5700 + }, + { + "epoch": 0.3494705918354856, + "grad_norm": 0.3418859541416168, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0092, + "step": 5710 + }, + { + "epoch": 0.3500826243956178, + "grad_norm": 0.3336535692214966, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0065, + "step": 5720 + }, + { + "epoch": 0.35069465695575003, + "grad_norm": 0.34575122594833374, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0068, + "step": 5730 + }, + { + "epoch": 0.35130668951588223, + "grad_norm": 0.34325110912323, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.01, + "step": 5740 + }, + { + "epoch": 0.35191872207601443, + "grad_norm": 0.20104236900806427, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0085, + "step": 5750 + }, + { + "epoch": 0.35253075463614664, + "grad_norm": 0.33699074387550354, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0073, + "step": 5760 + }, + { + "epoch": 0.35314278719627884, + "grad_norm": 0.33322635293006897, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.35375481975641104, + "grad_norm": 0.26897475123405457, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0087, + "step": 5780 + }, + { + "epoch": 0.35436685231654325, + "grad_norm": 0.5310013890266418, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0067, + "step": 5790 + }, + { + "epoch": 0.35497888487667545, + "grad_norm": 0.4203440845012665, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0097, + "step": 5800 + }, + { + "epoch": 0.35559091743680765, + "grad_norm": 0.2179369181394577, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0082, + "step": 5810 + }, + { + "epoch": 0.35620294999693985, + "grad_norm": 0.2789444625377655, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0066, + "step": 5820 + }, + { + "epoch": 0.35681498255707206, + "grad_norm": 0.28009694814682007, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.007, + "step": 5830 + }, + { + "epoch": 0.35742701511720426, + "grad_norm": 0.304768443107605, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0065, + "step": 5840 + }, + { + "epoch": 0.3580390476773364, + "grad_norm": 0.2829401195049286, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0061, + "step": 5850 + }, + { + "epoch": 0.3586510802374686, + "grad_norm": 0.3388998508453369, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0083, + "step": 5860 + }, + { + "epoch": 0.3592631127976008, + "grad_norm": 0.3313426673412323, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 0.359875145357733, + "grad_norm": 0.2886904180049896, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0094, + "step": 5880 + }, + { + "epoch": 0.3604871779178652, + "grad_norm": 0.3132432997226715, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0086, + "step": 5890 + }, + { + "epoch": 0.3610992104779974, + "grad_norm": 0.37195107340812683, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0137, + "step": 5900 + }, + { + "epoch": 0.3617112430381296, + "grad_norm": 0.30853375792503357, + "learning_rate": 1.875708056549365e-05, + "loss": 0.01, + "step": 5910 + }, + { + "epoch": 0.36232327559826183, + "grad_norm": 0.39785459637641907, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0073, + "step": 5920 + }, + { + "epoch": 0.36293530815839403, + "grad_norm": 0.26958727836608887, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0059, + "step": 5930 + }, + { + "epoch": 0.36354734071852624, + "grad_norm": 0.354956716299057, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0069, + "step": 5940 + }, + { + "epoch": 0.36415937327865844, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0066, + "step": 5950 + }, + { + "epoch": 0.36477140583879064, + "grad_norm": 0.30000701546669006, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0075, + "step": 5960 + }, + { + "epoch": 0.36538343839892284, + "grad_norm": 0.5558263063430786, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0083, + "step": 5970 + }, + { + "epoch": 0.36599547095905505, + "grad_norm": 0.39146295189857483, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0062, + "step": 5980 + }, + { + "epoch": 0.3666075035191872, + "grad_norm": 0.44002753496170044, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0089, + "step": 5990 + }, + { + "epoch": 0.3672195360793194, + "grad_norm": 0.3220095932483673, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0097, + "step": 6000 + }, + { + "epoch": 0.3678315686394516, + "grad_norm": 0.3569507598876953, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0067, + "step": 6010 + }, + { + "epoch": 0.3684436011995838, + "grad_norm": 0.3004184365272522, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0084, + "step": 6020 + }, + { + "epoch": 0.369055633759716, + "grad_norm": 0.2931320071220398, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0079, + "step": 6030 + }, + { + "epoch": 0.3696676663198482, + "grad_norm": 0.39551016688346863, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0088, + "step": 6040 + }, + { + "epoch": 0.3702796988799804, + "grad_norm": 0.33755603432655334, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0082, + "step": 6050 + }, + { + "epoch": 0.3708917314401126, + "grad_norm": 0.3101558983325958, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0069, + "step": 6060 + }, + { + "epoch": 0.3715037640002448, + "grad_norm": 0.2921602129936218, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0065, + "step": 6070 + }, + { + "epoch": 0.372115796560377, + "grad_norm": 0.3601403832435608, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0063, + "step": 6080 + }, + { + "epoch": 0.3727278291205092, + "grad_norm": 0.34929168224334717, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0073, + "step": 6090 + }, + { + "epoch": 0.3733398616806414, + "grad_norm": 0.3987390995025635, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0068, + "step": 6100 + }, + { + "epoch": 0.37395189424077363, + "grad_norm": 0.2641090452671051, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0091, + "step": 6110 + }, + { + "epoch": 0.37456392680090583, + "grad_norm": 0.23139338195323944, + "learning_rate": 1.865125972978549e-05, + "loss": 0.006, + "step": 6120 + }, + { + "epoch": 0.375175959361038, + "grad_norm": 0.26552167534828186, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.3757879919211702, + "grad_norm": 0.43827885389328003, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0084, + "step": 6140 + }, + { + "epoch": 0.3764000244813024, + "grad_norm": 0.27495354413986206, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 0.3770120570414346, + "grad_norm": 0.36078640818595886, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0104, + "step": 6160 + }, + { + "epoch": 0.3776240896015668, + "grad_norm": 0.28252753615379333, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0103, + "step": 6170 + }, + { + "epoch": 0.378236122161699, + "grad_norm": 0.2674558162689209, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0069, + "step": 6180 + }, + { + "epoch": 0.3788481547218312, + "grad_norm": 0.21457509696483612, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.3794601872819634, + "grad_norm": 0.3142339885234833, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0075, + "step": 6200 + }, + { + "epoch": 0.3800722198420956, + "grad_norm": 0.32714203000068665, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0084, + "step": 6210 + }, + { + "epoch": 0.3806842524022278, + "grad_norm": 0.2632557153701782, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0072, + "step": 6220 + }, + { + "epoch": 0.38129628496236, + "grad_norm": 0.1893932968378067, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0063, + "step": 6230 + }, + { + "epoch": 0.3819083175224922, + "grad_norm": 0.49935290217399597, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0087, + "step": 6240 + }, + { + "epoch": 0.3825203500826244, + "grad_norm": 0.34605127573013306, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0101, + "step": 6250 + }, + { + "epoch": 0.3831323826427566, + "grad_norm": 0.3294198513031006, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0067, + "step": 6260 + }, + { + "epoch": 0.38374441520288877, + "grad_norm": 0.34797370433807373, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0065, + "step": 6270 + }, + { + "epoch": 0.38435644776302097, + "grad_norm": 0.37710750102996826, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0061, + "step": 6280 + }, + { + "epoch": 0.3849684803231532, + "grad_norm": 0.39949893951416016, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0066, + "step": 6290 + }, + { + "epoch": 0.3855805128832854, + "grad_norm": 0.33014294505119324, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0095, + "step": 6300 + }, + { + "epoch": 0.3861925454434176, + "grad_norm": 0.4329249858856201, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0089, + "step": 6310 + }, + { + "epoch": 0.3868045780035498, + "grad_norm": 0.298330157995224, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0076, + "step": 6320 + }, + { + "epoch": 0.387416610563682, + "grad_norm": 0.2672661542892456, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 0.3880286431238142, + "grad_norm": 0.48193076252937317, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0103, + "step": 6340 + }, + { + "epoch": 0.3886406756839464, + "grad_norm": 0.29180601239204407, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0075, + "step": 6350 + }, + { + "epoch": 0.3892527082440786, + "grad_norm": 0.21320492029190063, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0077, + "step": 6360 + }, + { + "epoch": 0.3898647408042108, + "grad_norm": 0.37252935767173767, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0078, + "step": 6370 + }, + { + "epoch": 0.390476773364343, + "grad_norm": 0.284586101770401, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0089, + "step": 6380 + }, + { + "epoch": 0.3910888059244752, + "grad_norm": 0.5030382871627808, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0088, + "step": 6390 + }, + { + "epoch": 0.3917008384846074, + "grad_norm": 0.357239305973053, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0069, + "step": 6400 + }, + { + "epoch": 0.39231287104473955, + "grad_norm": 0.20308594405651093, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0062, + "step": 6410 + }, + { + "epoch": 0.39292490360487176, + "grad_norm": 0.2678150534629822, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.39353693616500396, + "grad_norm": 0.35160595178604126, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0072, + "step": 6430 + }, + { + "epoch": 0.39414896872513616, + "grad_norm": 0.33254173398017883, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 0.39476100128526836, + "grad_norm": 0.22763408720493317, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0061, + "step": 6450 + }, + { + "epoch": 0.39537303384540057, + "grad_norm": 0.20889192819595337, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 0.39598506640553277, + "grad_norm": 0.22515206038951874, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0086, + "step": 6470 + }, + { + "epoch": 0.396597098965665, + "grad_norm": 0.36421817541122437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0064, + "step": 6480 + }, + { + "epoch": 0.3972091315257972, + "grad_norm": 0.3869773745536804, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0084, + "step": 6490 + }, + { + "epoch": 0.3978211640859294, + "grad_norm": 0.26248687505722046, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0061, + "step": 6500 + }, + { + "epoch": 0.3984331966460616, + "grad_norm": 0.22152310609817505, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0057, + "step": 6510 + }, + { + "epoch": 0.3990452292061938, + "grad_norm": 0.25921961665153503, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0071, + "step": 6520 + }, + { + "epoch": 0.399657261766326, + "grad_norm": 0.3289903998374939, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.4002692943264582, + "grad_norm": 0.2767571210861206, + "learning_rate": 1.8427795928237e-05, + "loss": 0.01, + "step": 6540 + }, + { + "epoch": 0.40088132688659034, + "grad_norm": 0.46339666843414307, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0064, + "step": 6550 + }, + { + "epoch": 0.40149335944672254, + "grad_norm": 0.2942553460597992, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0067, + "step": 6560 + }, + { + "epoch": 0.40210539200685474, + "grad_norm": 0.3868240714073181, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0085, + "step": 6570 + }, + { + "epoch": 0.40271742456698695, + "grad_norm": 0.3999684154987335, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0067, + "step": 6580 + }, + { + "epoch": 0.40332945712711915, + "grad_norm": 0.42856812477111816, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0091, + "step": 6590 + }, + { + "epoch": 0.40394148968725135, + "grad_norm": 0.3099806010723114, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0084, + "step": 6600 + }, + { + "epoch": 0.40455352224738356, + "grad_norm": 0.3798827826976776, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0066, + "step": 6610 + }, + { + "epoch": 0.40516555480751576, + "grad_norm": 0.19007280468940735, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 0.40577758736764796, + "grad_norm": 0.3723277151584625, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0085, + "step": 6630 + }, + { + "epoch": 0.40638961992778017, + "grad_norm": 0.21034900844097137, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0069, + "step": 6640 + }, + { + "epoch": 0.40700165248791237, + "grad_norm": 0.29838645458221436, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0095, + "step": 6650 + }, + { + "epoch": 0.40761368504804457, + "grad_norm": 0.2645854353904724, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0061, + "step": 6660 + }, + { + "epoch": 0.4082257176081768, + "grad_norm": 0.21633592247962952, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.006, + "step": 6670 + }, + { + "epoch": 0.408837750168309, + "grad_norm": 0.25387731194496155, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.008, + "step": 6680 + }, + { + "epoch": 0.4094497827284412, + "grad_norm": 0.3752288520336151, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0092, + "step": 6690 + }, + { + "epoch": 0.41006181528857333, + "grad_norm": 0.33368971943855286, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0072, + "step": 6700 + }, + { + "epoch": 0.41067384784870553, + "grad_norm": 0.34388917684555054, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0074, + "step": 6710 + }, + { + "epoch": 0.41128588040883773, + "grad_norm": 0.2683192789554596, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.007, + "step": 6720 + }, + { + "epoch": 0.41189791296896994, + "grad_norm": 0.5121234059333801, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0074, + "step": 6730 + }, + { + "epoch": 0.41250994552910214, + "grad_norm": 0.333406925201416, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0117, + "step": 6740 + }, + { + "epoch": 0.41312197808923434, + "grad_norm": 0.26011794805526733, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0062, + "step": 6750 + }, + { + "epoch": 0.41373401064936655, + "grad_norm": 0.28925821185112, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0066, + "step": 6760 + }, + { + "epoch": 0.41434604320949875, + "grad_norm": 0.2202957570552826, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0058, + "step": 6770 + }, + { + "epoch": 0.41495807576963095, + "grad_norm": 0.2740793824195862, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0072, + "step": 6780 + }, + { + "epoch": 0.41557010832976315, + "grad_norm": 0.46569427847862244, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0069, + "step": 6790 + }, + { + "epoch": 0.41618214088989536, + "grad_norm": 0.3959881067276001, + "learning_rate": 1.828172598376902e-05, + "loss": 0.009, + "step": 6800 + }, + { + "epoch": 0.41679417345002756, + "grad_norm": 0.2465214729309082, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0068, + "step": 6810 + }, + { + "epoch": 0.41740620601015976, + "grad_norm": 0.3207756280899048, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0083, + "step": 6820 + }, + { + "epoch": 0.41801823857029197, + "grad_norm": 0.5600990653038025, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.4186302711304241, + "grad_norm": 0.32832831144332886, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0072, + "step": 6840 + }, + { + "epoch": 0.4192423036905563, + "grad_norm": 0.3397129774093628, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0083, + "step": 6850 + }, + { + "epoch": 0.4198543362506885, + "grad_norm": 0.3481312096118927, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.4204663688108207, + "grad_norm": 0.4542059898376465, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0104, + "step": 6870 + }, + { + "epoch": 0.4210784013709529, + "grad_norm": 0.2517620325088501, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0089, + "step": 6880 + }, + { + "epoch": 0.42169043393108513, + "grad_norm": 0.3671923875808716, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0068, + "step": 6890 + }, + { + "epoch": 0.42230246649121733, + "grad_norm": 0.41340726613998413, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0084, + "step": 6900 + }, + { + "epoch": 0.42291449905134954, + "grad_norm": 0.22815965116024017, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 0.42352653161148174, + "grad_norm": 0.35324010252952576, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0073, + "step": 6920 + }, + { + "epoch": 0.42413856417161394, + "grad_norm": 0.30134323239326477, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0071, + "step": 6930 + }, + { + "epoch": 0.42475059673174614, + "grad_norm": 0.4007415771484375, + "learning_rate": 1.82006727813775e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 0.42536262929187835, + "grad_norm": 0.3320179879665375, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0074, + "step": 6950 + }, + { + "epoch": 0.42597466185201055, + "grad_norm": 0.311971515417099, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0062, + "step": 6960 + }, + { + "epoch": 0.42658669441214275, + "grad_norm": 0.34347453713417053, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 0.4271987269722749, + "grad_norm": 0.25632336735725403, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.4278107595324071, + "grad_norm": 0.21711130440235138, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0068, + "step": 6990 + }, + { + "epoch": 0.4284227920925393, + "grad_norm": 0.3381270170211792, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0064, + "step": 7000 + }, + { + "epoch": 0.4290348246526715, + "grad_norm": 0.32262885570526123, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0091, + "step": 7010 + }, + { + "epoch": 0.4296468572128037, + "grad_norm": 0.65865558385849, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0104, + "step": 7020 + }, + { + "epoch": 0.4302588897729359, + "grad_norm": 0.3021128177642822, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.009, + "step": 7030 + }, + { + "epoch": 0.4308709223330681, + "grad_norm": 0.2859005331993103, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0065, + "step": 7040 + }, + { + "epoch": 0.4314829548932003, + "grad_norm": 0.3379405736923218, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0062, + "step": 7050 + }, + { + "epoch": 0.4320949874533325, + "grad_norm": 0.22009991109371185, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.007, + "step": 7060 + }, + { + "epoch": 0.4327070200134647, + "grad_norm": 0.24766206741333008, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0072, + "step": 7070 + }, + { + "epoch": 0.43331905257359693, + "grad_norm": 0.3557615280151367, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0096, + "step": 7080 + }, + { + "epoch": 0.43393108513372913, + "grad_norm": 0.5700691938400269, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0066, + "step": 7090 + }, + { + "epoch": 0.43454311769386134, + "grad_norm": 0.3194892704486847, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0065, + "step": 7100 + }, + { + "epoch": 0.43515515025399354, + "grad_norm": 0.2766750752925873, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0074, + "step": 7110 + }, + { + "epoch": 0.4357671828141257, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.809403050791396e-05, + "loss": 0.007, + "step": 7120 + }, + { + "epoch": 0.4363792153742579, + "grad_norm": 0.4468507170677185, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0066, + "step": 7130 + }, + { + "epoch": 0.4369912479343901, + "grad_norm": 0.3282400369644165, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0185, + "step": 7140 + }, + { + "epoch": 0.4376032804945223, + "grad_norm": 0.2625710964202881, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0088, + "step": 7150 + }, + { + "epoch": 0.4382153130546545, + "grad_norm": 0.47729599475860596, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.008, + "step": 7160 + }, + { + "epoch": 0.4388273456147867, + "grad_norm": 0.30350950360298157, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0091, + "step": 7170 + }, + { + "epoch": 0.4394393781749189, + "grad_norm": 0.3514627516269684, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0065, + "step": 7180 + }, + { + "epoch": 0.4400514107350511, + "grad_norm": 0.26150578260421753, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0087, + "step": 7190 + }, + { + "epoch": 0.4406634432951833, + "grad_norm": 0.374138206243515, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0073, + "step": 7200 + }, + { + "epoch": 0.4412754758553155, + "grad_norm": 0.2980635166168213, + "learning_rate": 1.803969531201634e-05, + "loss": 0.007, + "step": 7210 + }, + { + "epoch": 0.4418875084154477, + "grad_norm": 0.38190510869026184, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0077, + "step": 7220 + }, + { + "epoch": 0.4424995409755799, + "grad_norm": 0.28819066286087036, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0065, + "step": 7230 + }, + { + "epoch": 0.4431115735357121, + "grad_norm": 0.43382275104522705, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0077, + "step": 7240 + }, + { + "epoch": 0.4437236060958443, + "grad_norm": 0.31589648127555847, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0065, + "step": 7250 + }, + { + "epoch": 0.4443356386559765, + "grad_norm": 0.3744536340236664, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0067, + "step": 7260 + }, + { + "epoch": 0.4449476712161087, + "grad_norm": 0.2600225806236267, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.008, + "step": 7270 + }, + { + "epoch": 0.4455597037762409, + "grad_norm": 0.28064799308776855, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0061, + "step": 7280 + }, + { + "epoch": 0.4461717363363731, + "grad_norm": 0.2745135426521301, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0075, + "step": 7290 + }, + { + "epoch": 0.4467837688965053, + "grad_norm": 0.23609793186187744, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 0.4473958014566375, + "grad_norm": 0.35910022258758545, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0079, + "step": 7310 + }, + { + "epoch": 0.4480078340167697, + "grad_norm": 0.22230662405490875, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0065, + "step": 7320 + }, + { + "epoch": 0.4486198665769019, + "grad_norm": 0.3835199475288391, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.008, + "step": 7330 + }, + { + "epoch": 0.4492318991370341, + "grad_norm": 0.37863102555274963, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0083, + "step": 7340 + }, + { + "epoch": 0.4498439316971663, + "grad_norm": 0.25412216782569885, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0056, + "step": 7350 + }, + { + "epoch": 0.4504559642572985, + "grad_norm": 0.43248918652534485, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0061, + "step": 7360 + }, + { + "epoch": 0.4510679968174307, + "grad_norm": 0.2937811613082886, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0073, + "step": 7370 + }, + { + "epoch": 0.4516800293775629, + "grad_norm": 0.3018436133861542, + "learning_rate": 1.793524061803872e-05, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.4522920619376951, + "grad_norm": 0.32781726121902466, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0079, + "step": 7390 + }, + { + "epoch": 0.45290409449782726, + "grad_norm": 0.2843719720840454, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0067, + "step": 7400 + }, + { + "epoch": 0.45351612705795946, + "grad_norm": 0.27588292956352234, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0092, + "step": 7410 + }, + { + "epoch": 0.45412815961809166, + "grad_norm": 0.38858234882354736, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0067, + "step": 7420 + }, + { + "epoch": 0.45474019217822387, + "grad_norm": 0.4235166609287262, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 0.45535222473835607, + "grad_norm": 0.272210031747818, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0059, + "step": 7440 + }, + { + "epoch": 0.4559642572984883, + "grad_norm": 0.23851896822452545, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0071, + "step": 7450 + }, + { + "epoch": 0.4565762898586205, + "grad_norm": 0.37179476022720337, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0073, + "step": 7460 + }, + { + "epoch": 0.4571883224187527, + "grad_norm": 0.31902605295181274, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.009, + "step": 7470 + }, + { + "epoch": 0.4578003549788849, + "grad_norm": 0.47023633122444153, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0091, + "step": 7480 + }, + { + "epoch": 0.4584123875390171, + "grad_norm": 0.35726839303970337, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0079, + "step": 7490 + }, + { + "epoch": 0.4590244200991493, + "grad_norm": 0.27567291259765625, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0073, + "step": 7500 + }, + { + "epoch": 0.4596364526592815, + "grad_norm": 0.23053516447544098, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0065, + "step": 7510 + }, + { + "epoch": 0.4602484852194137, + "grad_norm": 0.2169056385755539, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0054, + "step": 7520 + }, + { + "epoch": 0.4608605177795459, + "grad_norm": 0.2912258207798004, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.46147255033967804, + "grad_norm": 0.2527846097946167, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.006, + "step": 7540 + }, + { + "epoch": 0.46208458289981025, + "grad_norm": 0.3878445029258728, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 0.46269661545994245, + "grad_norm": 0.3981980085372925, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0081, + "step": 7560 + }, + { + "epoch": 0.46330864802007465, + "grad_norm": 0.48834845423698425, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "epoch": 0.46392068058020686, + "grad_norm": 0.3045276701450348, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0085, + "step": 7580 + }, + { + "epoch": 0.46453271314033906, + "grad_norm": 0.23345299065113068, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0072, + "step": 7590 + }, + { + "epoch": 0.46514474570047126, + "grad_norm": 0.3632943034172058, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 0.46575677826060347, + "grad_norm": 0.19813670217990875, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0073, + "step": 7610 + }, + { + "epoch": 0.46636881082073567, + "grad_norm": 0.36094173789024353, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0085, + "step": 7620 + }, + { + "epoch": 0.46698084338086787, + "grad_norm": 0.30049464106559753, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0085, + "step": 7630 + }, + { + "epoch": 0.4675928759410001, + "grad_norm": 0.27693697810173035, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.4682049085011323, + "grad_norm": 0.3656866252422333, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0064, + "step": 7650 + }, + { + "epoch": 0.4688169410612645, + "grad_norm": 0.602168083190918, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 0.4694289736213967, + "grad_norm": 0.3553078770637512, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0062, + "step": 7670 + }, + { + "epoch": 0.47004100618152883, + "grad_norm": 0.326695054769516, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0062, + "step": 7680 + }, + { + "epoch": 0.47065303874166103, + "grad_norm": 0.2762170732021332, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0075, + "step": 7690 + }, + { + "epoch": 0.47126507130179324, + "grad_norm": 0.35057321190834045, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0063, + "step": 7700 + }, + { + "epoch": 0.47187710386192544, + "grad_norm": 0.3906462788581848, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0086, + "step": 7710 + }, + { + "epoch": 0.47248913642205764, + "grad_norm": 0.290752112865448, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0087, + "step": 7720 + }, + { + "epoch": 0.47310116898218985, + "grad_norm": 0.2242034673690796, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.47371320154232205, + "grad_norm": 0.3283435106277466, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0068, + "step": 7740 + }, + { + "epoch": 0.47432523410245425, + "grad_norm": 0.24059069156646729, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0056, + "step": 7750 + }, + { + "epoch": 0.47493726666258645, + "grad_norm": 0.2978667914867401, + "learning_rate": 1.769330275540774e-05, + "loss": 0.007, + "step": 7760 + }, + { + "epoch": 0.47554929922271866, + "grad_norm": 0.2605571150779724, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0084, + "step": 7770 + }, + { + "epoch": 0.47616133178285086, + "grad_norm": 0.4010445475578308, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0075, + "step": 7780 + }, + { + "epoch": 0.47677336434298306, + "grad_norm": 0.31932029128074646, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0076, + "step": 7790 + }, + { + "epoch": 0.47738539690311527, + "grad_norm": 0.3508684039115906, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0067, + "step": 7800 + }, + { + "epoch": 0.47799742946324747, + "grad_norm": 0.2835206091403961, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0064, + "step": 7810 + }, + { + "epoch": 0.4786094620233796, + "grad_norm": 0.2661663293838501, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0051, + "step": 7820 + }, + { + "epoch": 0.4792214945835118, + "grad_norm": 0.4146379828453064, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.479833527143644, + "grad_norm": 0.38621196150779724, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0076, + "step": 7840 + }, + { + "epoch": 0.4804455597037762, + "grad_norm": 0.19052188098430634, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.008, + "step": 7850 + }, + { + "epoch": 0.48105759226390843, + "grad_norm": 0.3699149489402771, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0071, + "step": 7860 + }, + { + "epoch": 0.48166962482404063, + "grad_norm": 0.3756427764892578, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0071, + "step": 7870 + }, + { + "epoch": 0.48228165738417283, + "grad_norm": 0.2987386882305145, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0069, + "step": 7880 + }, + { + "epoch": 0.48289368994430504, + "grad_norm": 0.24891899526119232, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.48350572250443724, + "grad_norm": 0.44080299139022827, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.011, + "step": 7900 + }, + { + "epoch": 0.48411775506456944, + "grad_norm": 0.20801177620887756, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0089, + "step": 7910 + }, + { + "epoch": 0.48472978762470165, + "grad_norm": 0.31475305557250977, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0091, + "step": 7920 + }, + { + "epoch": 0.48534182018483385, + "grad_norm": 0.29783639311790466, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0082, + "step": 7930 + }, + { + "epoch": 0.48595385274496605, + "grad_norm": 0.3330203890800476, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0071, + "step": 7940 + }, + { + "epoch": 0.48656588530509826, + "grad_norm": 0.3537667691707611, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0068, + "step": 7950 + }, + { + "epoch": 0.4871779178652304, + "grad_norm": 0.2810688316822052, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0059, + "step": 7960 + }, + { + "epoch": 0.4877899504253626, + "grad_norm": 0.3359779715538025, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.4884019829854948, + "grad_norm": 0.36015257239341736, + "learning_rate": 1.754802282200567e-05, + "loss": 0.008, + "step": 7980 + }, + { + "epoch": 0.489014015545627, + "grad_norm": 0.2647690176963806, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0065, + "step": 7990 + }, + { + "epoch": 0.4896260481057592, + "grad_norm": 0.23366811871528625, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0068, + "step": 8000 + }, + { + "epoch": 0.4902380806658914, + "grad_norm": 0.2904139757156372, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0054, + "step": 8010 + }, + { + "epoch": 0.4908501132260236, + "grad_norm": 0.30941230058670044, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0069, + "step": 8020 + }, + { + "epoch": 0.4914621457861558, + "grad_norm": 0.1959473341703415, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0063, + "step": 8030 + }, + { + "epoch": 0.492074178346288, + "grad_norm": 0.33349713683128357, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0059, + "step": 8040 + }, + { + "epoch": 0.49268621090642023, + "grad_norm": 0.39017921686172485, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0067, + "step": 8050 + }, + { + "epoch": 0.49329824346655243, + "grad_norm": 0.36401957273483276, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0061, + "step": 8060 + }, + { + "epoch": 0.49391027602668464, + "grad_norm": 0.22296921908855438, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0071, + "step": 8070 + }, + { + "epoch": 0.49452230858681684, + "grad_norm": 0.8712129592895508, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0104, + "step": 8080 + }, + { + "epoch": 0.49513434114694904, + "grad_norm": 0.39942649006843567, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0068, + "step": 8090 + }, + { + "epoch": 0.4957463737070812, + "grad_norm": 0.3821292817592621, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0076, + "step": 8100 + }, + { + "epoch": 0.4963584062672134, + "grad_norm": 0.35861077904701233, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0086, + "step": 8110 + }, + { + "epoch": 0.4969704388273456, + "grad_norm": 0.38629451394081116, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0068, + "step": 8120 + }, + { + "epoch": 0.4975824713874778, + "grad_norm": 3.412374973297119, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0168, + "step": 8130 + }, + { + "epoch": 0.49819450394761, + "grad_norm": 0.2893833816051483, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0065, + "step": 8140 + }, + { + "epoch": 0.4988065365077422, + "grad_norm": 0.37679117918014526, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0058, + "step": 8150 + }, + { + "epoch": 0.4994185690678744, + "grad_norm": 0.2745130658149719, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0082, + "step": 8160 + }, + { + "epoch": 0.5000306016280066, + "grad_norm": 0.30250442028045654, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0065, + "step": 8170 + }, + { + "epoch": 0.5006426341881388, + "grad_norm": 0.19602464139461517, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0056, + "step": 8180 + }, + { + "epoch": 0.501254666748271, + "grad_norm": 0.4736115634441376, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0062, + "step": 8190 + }, + { + "epoch": 0.5018666993084032, + "grad_norm": 0.25439244508743286, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0069, + "step": 8200 + }, + { + "epoch": 0.5024787318685354, + "grad_norm": 0.19290995597839355, + "learning_rate": 1.739216409306913e-05, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.5030907644286676, + "grad_norm": 0.24844267964363098, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0071, + "step": 8220 + }, + { + "epoch": 0.5037027969887998, + "grad_norm": 0.21179668605327606, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 0.504314829548932, + "grad_norm": 0.29139387607574463, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.5049268621090642, + "grad_norm": 0.2621973752975464, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0068, + "step": 8250 + }, + { + "epoch": 0.5055388946691964, + "grad_norm": 0.23394125699996948, + "learning_rate": 1.735775329110705e-05, + "loss": 0.006, + "step": 8260 + }, + { + "epoch": 0.5061509272293286, + "grad_norm": 0.28399863839149475, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0067, + "step": 8270 + }, + { + "epoch": 0.5067629597894608, + "grad_norm": 0.5048072934150696, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.008, + "step": 8280 + }, + { + "epoch": 0.507374992349593, + "grad_norm": 0.33848801255226135, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0054, + "step": 8290 + }, + { + "epoch": 0.5079870249097252, + "grad_norm": 0.28341951966285706, + "learning_rate": 1.733009030001197e-05, + "loss": 0.008, + "step": 8300 + }, + { + "epoch": 0.5085990574698575, + "grad_norm": 0.3223153054714203, + "learning_rate": 1.732315596014244e-05, + "loss": 0.007, + "step": 8310 + }, + { + "epoch": 0.5092110900299895, + "grad_norm": 0.23227599263191223, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.5098231225901217, + "grad_norm": 0.2847786247730255, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.510435155150254, + "grad_norm": 0.2026357650756836, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 0.5110471877103862, + "grad_norm": 0.3617453873157501, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 0.5116592202705184, + "grad_norm": 0.4439109265804291, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0067, + "step": 8360 + }, + { + "epoch": 0.5122712528306506, + "grad_norm": 0.26640209555625916, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0086, + "step": 8370 + }, + { + "epoch": 0.5128832853907828, + "grad_norm": 0.38045984506607056, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.513495317950915, + "grad_norm": 0.23035791516304016, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.006, + "step": 8390 + }, + { + "epoch": 0.5141073505110472, + "grad_norm": 0.40618664026260376, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0071, + "step": 8400 + }, + { + "epoch": 0.5147193830711794, + "grad_norm": 0.2593354880809784, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0064, + "step": 8410 + }, + { + "epoch": 0.5153314156313116, + "grad_norm": 0.27723655104637146, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0062, + "step": 8420 + }, + { + "epoch": 0.5159434481914438, + "grad_norm": 0.3793911039829254, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0059, + "step": 8430 + }, + { + "epoch": 0.516555480751576, + "grad_norm": 0.28634312748908997, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0063, + "step": 8440 + }, + { + "epoch": 0.5171675133117082, + "grad_norm": 0.39417290687561035, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0078, + "step": 8450 + }, + { + "epoch": 0.5177795458718404, + "grad_norm": 0.3043057322502136, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0064, + "step": 8460 + }, + { + "epoch": 0.5183915784319726, + "grad_norm": 0.36794111132621765, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0106, + "step": 8470 + }, + { + "epoch": 0.5190036109921048, + "grad_norm": 0.312161922454834, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0067, + "step": 8480 + }, + { + "epoch": 0.519615643552237, + "grad_norm": 0.39240267872810364, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0065, + "step": 8490 + }, + { + "epoch": 0.5202276761123692, + "grad_norm": 0.4500446915626526, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0073, + "step": 8500 + }, + { + "epoch": 0.5208397086725014, + "grad_norm": 0.22808927297592163, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0072, + "step": 8510 + }, + { + "epoch": 0.5214517412326336, + "grad_norm": 0.3262411057949066, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0065, + "step": 8520 + }, + { + "epoch": 0.5220637737927658, + "grad_norm": 0.472229927778244, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0068, + "step": 8530 + }, + { + "epoch": 0.522675806352898, + "grad_norm": 0.31563568115234375, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5232878389130302, + "grad_norm": 0.27949750423431396, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0064, + "step": 8550 + }, + { + "epoch": 0.5238998714731624, + "grad_norm": 0.30297499895095825, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0075, + "step": 8560 + }, + { + "epoch": 0.5245119040332946, + "grad_norm": 0.3946770429611206, + "learning_rate": 1.714028248198457e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 0.5251239365934268, + "grad_norm": 0.3405992090702057, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0082, + "step": 8580 + }, + { + "epoch": 0.525735969153559, + "grad_norm": 0.2963511347770691, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0066, + "step": 8590 + }, + { + "epoch": 0.5263480017136911, + "grad_norm": 0.1909177303314209, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.006, + "step": 8600 + }, + { + "epoch": 0.5269600342738233, + "grad_norm": 0.3378836512565613, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 0.5275720668339555, + "grad_norm": 0.30862805247306824, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0067, + "step": 8620 + }, + { + "epoch": 0.5281840993940877, + "grad_norm": 0.397293359041214, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0071, + "step": 8630 + }, + { + "epoch": 0.5287961319542199, + "grad_norm": 0.3665411174297333, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0068, + "step": 8640 + }, + { + "epoch": 0.5294081645143521, + "grad_norm": 0.34842419624328613, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.5300201970744843, + "grad_norm": 0.38205671310424805, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0065, + "step": 8660 + }, + { + "epoch": 0.5306322296346165, + "grad_norm": 0.35549092292785645, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0068, + "step": 8670 + }, + { + "epoch": 0.5312442621947487, + "grad_norm": 0.15676020085811615, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0055, + "step": 8680 + }, + { + "epoch": 0.5318562947548809, + "grad_norm": 0.22985056042671204, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0071, + "step": 8690 + }, + { + "epoch": 0.5324683273150131, + "grad_norm": 0.2743426263332367, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0047, + "step": 8700 + }, + { + "epoch": 0.5330803598751453, + "grad_norm": 0.2503803074359894, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0079, + "step": 8710 + }, + { + "epoch": 0.5336923924352776, + "grad_norm": 0.5036469101905823, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5343044249954098, + "grad_norm": 0.2349964827299118, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0068, + "step": 8730 + }, + { + "epoch": 0.534916457555542, + "grad_norm": 0.28706061840057373, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0065, + "step": 8740 + }, + { + "epoch": 0.5355284901156742, + "grad_norm": 0.21812452375888824, + "learning_rate": 1.701081551967764e-05, + "loss": 0.008, + "step": 8750 + }, + { + "epoch": 0.5361405226758064, + "grad_norm": 0.301618754863739, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 0.5367525552359386, + "grad_norm": 0.35402950644493103, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0067, + "step": 8770 + }, + { + "epoch": 0.5373645877960708, + "grad_norm": 0.2875203788280487, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0097, + "step": 8780 + }, + { + "epoch": 0.537976620356203, + "grad_norm": 0.2358965128660202, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0053, + "step": 8790 + }, + { + "epoch": 0.5385886529163352, + "grad_norm": 0.14462094008922577, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0059, + "step": 8800 + }, + { + "epoch": 0.5392006854764674, + "grad_norm": 0.17893171310424805, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0062, + "step": 8810 + }, + { + "epoch": 0.5398127180365996, + "grad_norm": 0.2923351526260376, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0078, + "step": 8820 + }, + { + "epoch": 0.5404247505967318, + "grad_norm": 0.3288479745388031, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0067, + "step": 8830 + }, + { + "epoch": 0.541036783156864, + "grad_norm": 0.3996310532093048, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5416488157169962, + "grad_norm": 0.24345380067825317, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0067, + "step": 8850 + }, + { + "epoch": 0.5422608482771284, + "grad_norm": 0.26688340306282043, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0072, + "step": 8860 + }, + { + "epoch": 0.5428728808372606, + "grad_norm": 0.4816153645515442, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0099, + "step": 8870 + }, + { + "epoch": 0.5434849133973927, + "grad_norm": 0.22544988989830017, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.007, + "step": 8880 + }, + { + "epoch": 0.5440969459575249, + "grad_norm": 0.2820419669151306, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0064, + "step": 8890 + }, + { + "epoch": 0.5447089785176571, + "grad_norm": 0.2758846879005432, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0072, + "step": 8900 + }, + { + "epoch": 0.5453210110777893, + "grad_norm": 0.4620129466056824, + "learning_rate": 1.689381359053773e-05, + "loss": 0.008, + "step": 8910 + }, + { + "epoch": 0.5459330436379215, + "grad_norm": 0.5567039847373962, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0079, + "step": 8920 + }, + { + "epoch": 0.5465450761980537, + "grad_norm": 0.347251832485199, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.006, + "step": 8930 + }, + { + "epoch": 0.5471571087581859, + "grad_norm": 0.31768012046813965, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0065, + "step": 8940 + }, + { + "epoch": 0.5477691413183181, + "grad_norm": 0.24245156347751617, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0052, + "step": 8950 + }, + { + "epoch": 0.5483811738784503, + "grad_norm": 0.2124931961297989, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0074, + "step": 8960 + }, + { + "epoch": 0.5489932064385825, + "grad_norm": 0.18998636305332184, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0056, + "step": 8970 + }, + { + "epoch": 0.5496052389987147, + "grad_norm": 0.2667362689971924, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0056, + "step": 8980 + }, + { + "epoch": 0.5502172715588469, + "grad_norm": 0.4424617886543274, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0091, + "step": 8990 + }, + { + "epoch": 0.5508293041189791, + "grad_norm": 0.33623644709587097, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0061, + "step": 9000 + }, + { + "epoch": 0.5514413366791113, + "grad_norm": 0.29990604519844055, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0059, + "step": 9010 + }, + { + "epoch": 0.5520533692392435, + "grad_norm": 0.4384118914604187, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.5526654017993757, + "grad_norm": 0.3468496799468994, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0068, + "step": 9030 + }, + { + "epoch": 0.5532774343595079, + "grad_norm": 0.3473573327064514, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0062, + "step": 9040 + }, + { + "epoch": 0.5538894669196401, + "grad_norm": 0.36125242710113525, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0062, + "step": 9050 + }, + { + "epoch": 0.5545014994797723, + "grad_norm": 0.2603420615196228, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0091, + "step": 9060 + }, + { + "epoch": 0.5551135320399045, + "grad_norm": 0.27355659008026123, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0074, + "step": 9070 + }, + { + "epoch": 0.5557255646000367, + "grad_norm": 0.24741119146347046, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0064, + "step": 9080 + }, + { + "epoch": 0.556337597160169, + "grad_norm": 0.2001475840806961, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0094, + "step": 9090 + }, + { + "epoch": 0.5569496297203012, + "grad_norm": 0.41522347927093506, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0088, + "step": 9100 + }, + { + "epoch": 0.5575616622804334, + "grad_norm": 0.27282488346099854, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0062, + "step": 9110 + }, + { + "epoch": 0.5581736948405656, + "grad_norm": 0.26905956864356995, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.007, + "step": 9120 + }, + { + "epoch": 0.5587857274006978, + "grad_norm": 0.24747484922409058, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0084, + "step": 9130 + }, + { + "epoch": 0.55939775996083, + "grad_norm": 0.1863871067762375, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0061, + "step": 9140 + }, + { + "epoch": 0.5600097925209622, + "grad_norm": 0.3599740266799927, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0063, + "step": 9150 + }, + { + "epoch": 0.5606218250810943, + "grad_norm": 0.2238125205039978, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0047, + "step": 9160 + }, + { + "epoch": 0.5612338576412265, + "grad_norm": 0.272077351808548, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.006, + "step": 9170 + }, + { + "epoch": 0.5618458902013587, + "grad_norm": 0.2371625155210495, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0064, + "step": 9180 + }, + { + "epoch": 0.5624579227614909, + "grad_norm": 0.12783293426036835, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0054, + "step": 9190 + }, + { + "epoch": 0.5630699553216231, + "grad_norm": 0.3144581615924835, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0064, + "step": 9200 + }, + { + "epoch": 0.5636819878817553, + "grad_norm": 0.31995031237602234, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0082, + "step": 9210 + }, + { + "epoch": 0.5642940204418875, + "grad_norm": 0.31995660066604614, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0083, + "step": 9220 + }, + { + "epoch": 0.5649060530020197, + "grad_norm": 0.5018982291221619, + "learning_rate": 1.665453350687773e-05, + "loss": 0.007, + "step": 9230 + }, + { + "epoch": 0.5655180855621519, + "grad_norm": 0.2927841544151306, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0104, + "step": 9240 + }, + { + "epoch": 0.5661301181222841, + "grad_norm": 0.21124979853630066, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0068, + "step": 9250 + }, + { + "epoch": 0.5667421506824163, + "grad_norm": 0.25787463784217834, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.5673541832425485, + "grad_norm": 0.3194720447063446, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0065, + "step": 9270 + }, + { + "epoch": 0.5679662158026807, + "grad_norm": 0.24165599048137665, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.006, + "step": 9280 + }, + { + "epoch": 0.5685782483628129, + "grad_norm": 0.4880482256412506, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0096, + "step": 9290 + }, + { + "epoch": 0.5691902809229451, + "grad_norm": 0.24660199880599976, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0098, + "step": 9300 + }, + { + "epoch": 0.5698023134830773, + "grad_norm": 0.24707400798797607, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0052, + "step": 9310 + }, + { + "epoch": 0.5704143460432095, + "grad_norm": 0.33855682611465454, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.011, + "step": 9320 + }, + { + "epoch": 0.5710263786033417, + "grad_norm": 0.22913751006126404, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0074, + "step": 9330 + }, + { + "epoch": 0.5716384111634739, + "grad_norm": 0.24127185344696045, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0062, + "step": 9340 + }, + { + "epoch": 0.5722504437236061, + "grad_norm": 0.26104915142059326, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0065, + "step": 9350 + }, + { + "epoch": 0.5728624762837383, + "grad_norm": 0.21698857843875885, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0062, + "step": 9360 + }, + { + "epoch": 0.5734745088438705, + "grad_norm": 0.29092445969581604, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0081, + "step": 9370 + }, + { + "epoch": 0.5740865414040027, + "grad_norm": 0.2534378468990326, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0064, + "step": 9380 + }, + { + "epoch": 0.5746985739641349, + "grad_norm": 0.28900131583213806, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.5753106065242671, + "grad_norm": 0.3028101921081543, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0104, + "step": 9400 + }, + { + "epoch": 0.5759226390843993, + "grad_norm": 0.28851139545440674, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 0.5765346716445315, + "grad_norm": 0.5735841393470764, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0072, + "step": 9420 + }, + { + "epoch": 0.5771467042046637, + "grad_norm": 0.20355567336082458, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0056, + "step": 9430 + }, + { + "epoch": 0.5777587367647958, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.006, + "step": 9440 + }, + { + "epoch": 0.578370769324928, + "grad_norm": 0.2701684832572937, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 0.5789828018850602, + "grad_norm": 0.17381855845451355, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0054, + "step": 9460 + }, + { + "epoch": 0.5795948344451924, + "grad_norm": 0.250261515378952, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5802068670053246, + "grad_norm": 0.22972841560840607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0058, + "step": 9480 + }, + { + "epoch": 0.5808188995654568, + "grad_norm": 0.22654809057712555, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0061, + "step": 9490 + }, + { + "epoch": 0.581430932125589, + "grad_norm": 0.17165100574493408, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5820429646857213, + "grad_norm": 0.2462143450975418, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0054, + "step": 9510 + }, + { + "epoch": 0.5826549972458535, + "grad_norm": 0.3970383107662201, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0069, + "step": 9520 + }, + { + "epoch": 0.5832670298059857, + "grad_norm": 0.21578988432884216, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0054, + "step": 9530 + }, + { + "epoch": 0.5838790623661179, + "grad_norm": 0.5680915713310242, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0072, + "step": 9540 + }, + { + "epoch": 0.5844910949262501, + "grad_norm": 0.24070246517658234, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0073, + "step": 9550 + }, + { + "epoch": 0.5851031274863823, + "grad_norm": 0.2524685263633728, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0068, + "step": 9560 + }, + { + "epoch": 0.5857151600465145, + "grad_norm": 0.27286672592163086, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.008, + "step": 9570 + }, + { + "epoch": 0.5863271926066467, + "grad_norm": 0.3459629714488983, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0088, + "step": 9580 + }, + { + "epoch": 0.5869392251667789, + "grad_norm": 0.2964814603328705, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0066, + "step": 9590 + }, + { + "epoch": 0.5875512577269111, + "grad_norm": 0.3559853434562683, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0064, + "step": 9600 + }, + { + "epoch": 0.5881632902870433, + "grad_norm": 0.256898432970047, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0056, + "step": 9610 + }, + { + "epoch": 0.5887753228471755, + "grad_norm": 0.25032711029052734, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0052, + "step": 9620 + }, + { + "epoch": 0.5893873554073077, + "grad_norm": 0.2467224895954132, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0048, + "step": 9630 + }, + { + "epoch": 0.5899993879674399, + "grad_norm": 0.5331161618232727, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0078, + "step": 9640 + }, + { + "epoch": 0.5906114205275721, + "grad_norm": 0.33348897099494934, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0068, + "step": 9650 + }, + { + "epoch": 0.5912234530877043, + "grad_norm": 0.21435993909835815, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0058, + "step": 9660 + }, + { + "epoch": 0.5918354856478365, + "grad_norm": 0.35850396752357483, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0068, + "step": 9670 + }, + { + "epoch": 0.5924475182079687, + "grad_norm": 0.3007623851299286, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0063, + "step": 9680 + }, + { + "epoch": 0.5930595507681009, + "grad_norm": 0.22949714958667755, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0054, + "step": 9690 + }, + { + "epoch": 0.5936715833282331, + "grad_norm": 0.23259367048740387, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.5942836158883653, + "grad_norm": 0.2305079996585846, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0047, + "step": 9710 + }, + { + "epoch": 0.5948956484484974, + "grad_norm": 0.33875930309295654, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0063, + "step": 9720 + }, + { + "epoch": 0.5955076810086296, + "grad_norm": 0.3981896936893463, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0076, + "step": 9730 + }, + { + "epoch": 0.5961197135687618, + "grad_norm": 0.280831515789032, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0075, + "step": 9740 + }, + { + "epoch": 0.596731746128894, + "grad_norm": 0.26045629382133484, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0064, + "step": 9750 + }, + { + "epoch": 0.5973437786890262, + "grad_norm": 0.23102521896362305, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0071, + "step": 9760 + }, + { + "epoch": 0.5979558112491584, + "grad_norm": 0.5013224482536316, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0063, + "step": 9770 + }, + { + "epoch": 0.5985678438092906, + "grad_norm": 0.45689067244529724, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0088, + "step": 9780 + }, + { + "epoch": 0.5991798763694228, + "grad_norm": 0.27118632197380066, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.599791908929555, + "grad_norm": 0.420202374458313, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0076, + "step": 9800 + }, + { + "epoch": 0.6004039414896872, + "grad_norm": 0.35844025015830994, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0056, + "step": 9810 + }, + { + "epoch": 0.6010159740498194, + "grad_norm": 0.2205585241317749, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0082, + "step": 9820 + }, + { + "epoch": 0.6016280066099516, + "grad_norm": 0.18860426545143127, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.007, + "step": 9830 + }, + { + "epoch": 0.6022400391700838, + "grad_norm": 0.25045180320739746, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0082, + "step": 9840 + }, + { + "epoch": 0.602852071730216, + "grad_norm": 0.2581705152988434, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0065, + "step": 9850 + }, + { + "epoch": 0.6034641042903482, + "grad_norm": 0.25894811749458313, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0058, + "step": 9860 + }, + { + "epoch": 0.6040761368504804, + "grad_norm": 0.43305444717407227, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0066, + "step": 9870 + }, + { + "epoch": 0.6046881694106127, + "grad_norm": 0.2295757383108139, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0069, + "step": 9880 + }, + { + "epoch": 0.6053002019707449, + "grad_norm": 0.29785802960395813, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0074, + "step": 9890 + }, + { + "epoch": 0.6059122345308771, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0068, + "step": 9900 + }, + { + "epoch": 0.6065242670910093, + "grad_norm": 0.29115045070648193, + "learning_rate": 1.612387195896372e-05, + "loss": 0.008, + "step": 9910 + }, + { + "epoch": 0.6071362996511415, + "grad_norm": 0.3202555477619171, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0071, + "step": 9920 + }, + { + "epoch": 0.6077483322112737, + "grad_norm": 0.2849314212799072, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.005, + "step": 9930 + }, + { + "epoch": 0.6083603647714059, + "grad_norm": 0.2768756151199341, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0051, + "step": 9940 + }, + { + "epoch": 0.6089723973315381, + "grad_norm": 0.3138035535812378, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0058, + "step": 9950 + }, + { + "epoch": 0.6095844298916703, + "grad_norm": 0.20827682316303253, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0058, + "step": 9960 + }, + { + "epoch": 0.6101964624518025, + "grad_norm": 0.29986995458602905, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0076, + "step": 9970 + }, + { + "epoch": 0.6108084950119347, + "grad_norm": 0.23564326763153076, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0056, + "step": 9980 + }, + { + "epoch": 0.6114205275720669, + "grad_norm": 0.24854765832424164, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0066, + "step": 9990 + }, + { + "epoch": 0.6120325601321991, + "grad_norm": 0.5696694850921631, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0072, + "step": 10000 + }, + { + "epoch": 0.6126445926923312, + "grad_norm": 0.24267911911010742, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0061, + "step": 10010 + }, + { + "epoch": 0.6132566252524634, + "grad_norm": 0.1955283135175705, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0076, + "step": 10020 + }, + { + "epoch": 0.6138686578125956, + "grad_norm": 0.3427830934524536, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0073, + "step": 10030 + }, + { + "epoch": 0.6144806903727278, + "grad_norm": 0.38532915711402893, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0078, + "step": 10040 + }, + { + "epoch": 0.61509272293286, + "grad_norm": 0.4302294850349426, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0071, + "step": 10050 + }, + { + "epoch": 0.6157047554929922, + "grad_norm": 0.38420233130455017, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0072, + "step": 10060 + }, + { + "epoch": 0.6163167880531244, + "grad_norm": 0.23822636902332306, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.004, + "step": 10070 + }, + { + "epoch": 0.6169288206132566, + "grad_norm": 0.25123289227485657, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0065, + "step": 10080 + }, + { + "epoch": 0.6175408531733888, + "grad_norm": 0.23007746040821075, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0059, + "step": 10090 + }, + { + "epoch": 0.618152885733521, + "grad_norm": 0.24051082134246826, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0089, + "step": 10100 + }, + { + "epoch": 0.6187649182936532, + "grad_norm": 0.26246321201324463, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0052, + "step": 10110 + }, + { + "epoch": 0.6193769508537854, + "grad_norm": 0.3160432279109955, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0059, + "step": 10120 + }, + { + "epoch": 0.6199889834139176, + "grad_norm": 0.42534199357032776, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0071, + "step": 10130 + }, + { + "epoch": 0.6206010159740498, + "grad_norm": 0.22966268658638, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0052, + "step": 10140 + }, + { + "epoch": 0.621213048534182, + "grad_norm": 0.22234882414340973, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0073, + "step": 10150 + }, + { + "epoch": 0.6218250810943142, + "grad_norm": 0.31061676144599915, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0066, + "step": 10160 + }, + { + "epoch": 0.6224371136544464, + "grad_norm": 0.34178492426872253, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.6230491462145786, + "grad_norm": 0.263583779335022, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0079, + "step": 10180 + }, + { + "epoch": 0.6236611787747108, + "grad_norm": 0.3774336278438568, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0066, + "step": 10190 + }, + { + "epoch": 0.624273211334843, + "grad_norm": 0.29274430871009827, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.007, + "step": 10200 + }, + { + "epoch": 0.6248852438949752, + "grad_norm": 0.31850868463516235, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0056, + "step": 10210 + }, + { + "epoch": 0.6254972764551074, + "grad_norm": 0.3084369897842407, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0086, + "step": 10220 + }, + { + "epoch": 0.6261093090152396, + "grad_norm": 0.21596118807792664, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0072, + "step": 10230 + }, + { + "epoch": 0.6267213415753718, + "grad_norm": 0.16397996246814728, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0075, + "step": 10240 + }, + { + "epoch": 0.627333374135504, + "grad_norm": 0.15055827796459198, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0046, + "step": 10250 + }, + { + "epoch": 0.6279454066956363, + "grad_norm": 0.23483684659004211, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0064, + "step": 10260 + }, + { + "epoch": 0.6285574392557685, + "grad_norm": 0.3131091594696045, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0069, + "step": 10270 + }, + { + "epoch": 0.6291694718159007, + "grad_norm": 0.27958226203918457, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0067, + "step": 10280 + }, + { + "epoch": 0.6297815043760328, + "grad_norm": 0.23422567546367645, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0086, + "step": 10290 + }, + { + "epoch": 0.630393536936165, + "grad_norm": 0.4644703269004822, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0067, + "step": 10300 + }, + { + "epoch": 0.6310055694962972, + "grad_norm": 0.45787107944488525, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0068, + "step": 10310 + }, + { + "epoch": 0.6316176020564294, + "grad_norm": 0.21038737893104553, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0074, + "step": 10320 + }, + { + "epoch": 0.6322296346165616, + "grad_norm": 0.23812010884284973, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0054, + "step": 10330 + }, + { + "epoch": 0.6328416671766938, + "grad_norm": 0.36856284737586975, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0061, + "step": 10340 + }, + { + "epoch": 0.633453699736826, + "grad_norm": 0.3540131151676178, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0074, + "step": 10350 + }, + { + "epoch": 0.6340657322969582, + "grad_norm": 0.3004823923110962, + "learning_rate": 1.575723252169281e-05, + "loss": 0.006, + "step": 10360 + }, + { + "epoch": 0.6346777648570904, + "grad_norm": 0.17188489437103271, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0053, + "step": 10370 + }, + { + "epoch": 0.6352897974172226, + "grad_norm": 0.21710847318172455, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0062, + "step": 10380 + }, + { + "epoch": 0.6359018299773548, + "grad_norm": 0.2356785386800766, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0061, + "step": 10390 + }, + { + "epoch": 0.636513862537487, + "grad_norm": 0.2736414670944214, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.6371258950976192, + "grad_norm": 0.23872444033622742, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.006, + "step": 10410 + }, + { + "epoch": 0.6377379276577514, + "grad_norm": 0.24478361010551453, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0073, + "step": 10420 + }, + { + "epoch": 0.6383499602178836, + "grad_norm": 0.2964334487915039, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0048, + "step": 10430 + }, + { + "epoch": 0.6389619927780158, + "grad_norm": 0.2760549783706665, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0051, + "step": 10440 + }, + { + "epoch": 0.639574025338148, + "grad_norm": 0.2598065137863159, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0072, + "step": 10450 + }, + { + "epoch": 0.6401860578982802, + "grad_norm": 0.346999853849411, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0052, + "step": 10460 + }, + { + "epoch": 0.6407980904584124, + "grad_norm": 0.31291016936302185, + "learning_rate": 1.56658563993822e-05, + "loss": 0.007, + "step": 10470 + }, + { + "epoch": 0.6414101230185446, + "grad_norm": 0.2631952166557312, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6420221555786768, + "grad_norm": 0.30895209312438965, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.006, + "step": 10490 + }, + { + "epoch": 0.642634188138809, + "grad_norm": 0.17614217102527618, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0069, + "step": 10500 + }, + { + "epoch": 0.6432462206989412, + "grad_norm": 0.38792312145233154, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0077, + "step": 10510 + }, + { + "epoch": 0.6438582532590734, + "grad_norm": 0.1722564697265625, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0054, + "step": 10520 + }, + { + "epoch": 0.6444702858192056, + "grad_norm": 0.2741699516773224, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0062, + "step": 10530 + }, + { + "epoch": 0.6450823183793378, + "grad_norm": 0.2059863954782486, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0073, + "step": 10540 + }, + { + "epoch": 0.64569435093947, + "grad_norm": 0.2702447474002838, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.6463063834996022, + "grad_norm": 0.2299312800168991, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0051, + "step": 10560 + }, + { + "epoch": 0.6469184160597343, + "grad_norm": 0.1995723992586136, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0057, + "step": 10570 + }, + { + "epoch": 0.6475304486198665, + "grad_norm": 0.30346980690956116, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0063, + "step": 10580 + }, + { + "epoch": 0.6481424811799987, + "grad_norm": 0.5040738582611084, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0064, + "step": 10590 + }, + { + "epoch": 0.6487545137401309, + "grad_norm": 0.16984818875789642, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0071, + "step": 10600 + }, + { + "epoch": 0.6493665463002631, + "grad_norm": 0.26560020446777344, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 0.6499785788603953, + "grad_norm": 0.4563823342323303, + "learning_rate": 1.554018740860716e-05, + "loss": 0.008, + "step": 10620 + }, + { + "epoch": 0.6505906114205275, + "grad_norm": 0.23272818326950073, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.006, + "step": 10630 + }, + { + "epoch": 0.6512026439806597, + "grad_norm": 0.19166870415210724, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0066, + "step": 10640 + }, + { + "epoch": 0.651814676540792, + "grad_norm": 0.2822705805301666, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0062, + "step": 10650 + }, + { + "epoch": 0.6524267091009242, + "grad_norm": 0.24001267552375793, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 0.6530387416610564, + "grad_norm": 0.2563900947570801, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0068, + "step": 10670 + }, + { + "epoch": 0.6536507742211886, + "grad_norm": 0.2747437357902527, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0058, + "step": 10680 + }, + { + "epoch": 0.6542628067813208, + "grad_norm": 0.39710354804992676, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.005, + "step": 10690 + }, + { + "epoch": 0.654874839341453, + "grad_norm": 0.30690231919288635, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0102, + "step": 10700 + }, + { + "epoch": 0.6554868719015852, + "grad_norm": 0.2879253923892975, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0072, + "step": 10710 + }, + { + "epoch": 0.6560989044617174, + "grad_norm": 0.19964110851287842, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.6567109370218496, + "grad_norm": 0.20109151303768158, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6573229695819818, + "grad_norm": 0.21469832956790924, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0048, + "step": 10740 + }, + { + "epoch": 0.657935002142114, + "grad_norm": 0.19622936844825745, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0057, + "step": 10750 + }, + { + "epoch": 0.6585470347022462, + "grad_norm": 0.2255190759897232, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0052, + "step": 10760 + }, + { + "epoch": 0.6591590672623784, + "grad_norm": 0.47484955191612244, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0066, + "step": 10770 + }, + { + "epoch": 0.6597710998225106, + "grad_norm": 0.32192179560661316, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0067, + "step": 10780 + }, + { + "epoch": 0.6603831323826428, + "grad_norm": 0.33044904470443726, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0061, + "step": 10790 + }, + { + "epoch": 0.660995164942775, + "grad_norm": 0.3206661343574524, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0072, + "step": 10800 + }, + { + "epoch": 0.6616071975029072, + "grad_norm": 0.34903818368911743, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0055, + "step": 10810 + }, + { + "epoch": 0.6622192300630394, + "grad_norm": 0.1982222944498062, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 0.6628312626231716, + "grad_norm": 0.25388309359550476, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "epoch": 0.6634432951833038, + "grad_norm": 0.2325269728899002, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0078, + "step": 10840 + }, + { + "epoch": 0.6640553277434359, + "grad_norm": 0.3364964425563812, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0054, + "step": 10850 + }, + { + "epoch": 0.6646673603035681, + "grad_norm": 0.198661208152771, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0061, + "step": 10860 + }, + { + "epoch": 0.6652793928637003, + "grad_norm": 0.333836168050766, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0058, + "step": 10870 + }, + { + "epoch": 0.6658914254238325, + "grad_norm": 0.21908101439476013, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0087, + "step": 10880 + }, + { + "epoch": 0.6665034579839647, + "grad_norm": 0.3094167709350586, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0062, + "step": 10890 + }, + { + "epoch": 0.6671154905440969, + "grad_norm": 0.28113746643066406, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0066, + "step": 10900 + }, + { + "epoch": 0.6677275231042291, + "grad_norm": 0.20239399373531342, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0071, + "step": 10910 + }, + { + "epoch": 0.6683395556643613, + "grad_norm": 0.32829156517982483, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0056, + "step": 10920 + }, + { + "epoch": 0.6689515882244935, + "grad_norm": 0.2950859069824219, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 0.6695636207846257, + "grad_norm": 0.36404141783714294, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0075, + "step": 10940 + }, + { + "epoch": 0.6701756533447579, + "grad_norm": 0.2479381114244461, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0055, + "step": 10950 + }, + { + "epoch": 0.6707876859048901, + "grad_norm": 0.1934390366077423, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.005, + "step": 10960 + }, + { + "epoch": 0.6713997184650223, + "grad_norm": 0.20912423729896545, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0064, + "step": 10970 + }, + { + "epoch": 0.6720117510251545, + "grad_norm": 0.1781405806541443, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0048, + "step": 10980 + }, + { + "epoch": 0.6726237835852867, + "grad_norm": 0.18812811374664307, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0048, + "step": 10990 + }, + { + "epoch": 0.6732358161454189, + "grad_norm": 0.2006077766418457, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0073, + "step": 11000 + }, + { + "epoch": 0.6738478487055511, + "grad_norm": 0.20471568405628204, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0068, + "step": 11010 + }, + { + "epoch": 0.6744598812656833, + "grad_norm": 0.2979716658592224, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0067, + "step": 11020 + }, + { + "epoch": 0.6750719138258156, + "grad_norm": 0.3256290853023529, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0054, + "step": 11030 + }, + { + "epoch": 0.6756839463859478, + "grad_norm": 0.3346560001373291, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0061, + "step": 11040 + }, + { + "epoch": 0.67629597894608, + "grad_norm": 0.35791122913360596, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0054, + "step": 11050 + }, + { + "epoch": 0.6769080115062122, + "grad_norm": 0.30428826808929443, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0066, + "step": 11060 + }, + { + "epoch": 0.6775200440663444, + "grad_norm": 0.31254154443740845, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0065, + "step": 11070 + }, + { + "epoch": 0.6781320766264766, + "grad_norm": 0.263028621673584, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0062, + "step": 11080 + }, + { + "epoch": 0.6787441091866088, + "grad_norm": 0.22496990859508514, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.679356141746741, + "grad_norm": 0.2647632360458374, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0072, + "step": 11100 + }, + { + "epoch": 0.6799681743068732, + "grad_norm": 0.2517150342464447, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0064, + "step": 11110 + }, + { + "epoch": 0.6805802068670054, + "grad_norm": 0.30550616979599, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0055, + "step": 11120 + }, + { + "epoch": 0.6811922394271375, + "grad_norm": 0.21312931180000305, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0074, + "step": 11130 + }, + { + "epoch": 0.6818042719872697, + "grad_norm": 0.21152199804782867, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0047, + "step": 11140 + }, + { + "epoch": 0.6824163045474019, + "grad_norm": 0.2030613273382187, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0045, + "step": 11150 + }, + { + "epoch": 0.6830283371075341, + "grad_norm": 0.30646151304244995, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0045, + "step": 11160 + }, + { + "epoch": 0.6836403696676663, + "grad_norm": 0.2693783938884735, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0061, + "step": 11170 + }, + { + "epoch": 0.6842524022277985, + "grad_norm": 0.25288495421409607, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0068, + "step": 11180 + }, + { + "epoch": 0.6848644347879307, + "grad_norm": 0.34989964962005615, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.007, + "step": 11190 + }, + { + "epoch": 0.6854764673480629, + "grad_norm": 0.192350834608078, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0064, + "step": 11200 + }, + { + "epoch": 0.6860884999081951, + "grad_norm": 0.3841196894645691, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.6867005324683273, + "grad_norm": 0.2168666571378708, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0063, + "step": 11220 + }, + { + "epoch": 0.6873125650284595, + "grad_norm": 0.2756234109401703, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0068, + "step": 11230 + }, + { + "epoch": 0.6879245975885917, + "grad_norm": 0.1971903294324875, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.006, + "step": 11240 + }, + { + "epoch": 0.6885366301487239, + "grad_norm": 0.3857499659061432, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0063, + "step": 11250 + }, + { + "epoch": 0.6891486627088561, + "grad_norm": 0.194110706448555, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0049, + "step": 11260 + }, + { + "epoch": 0.6897606952689883, + "grad_norm": 0.24935179948806763, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0054, + "step": 11270 + }, + { + "epoch": 0.6903727278291205, + "grad_norm": 0.5208527445793152, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0062, + "step": 11280 + }, + { + "epoch": 0.6909847603892527, + "grad_norm": 0.2917899191379547, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0058, + "step": 11290 + }, + { + "epoch": 0.6915967929493849, + "grad_norm": 0.42692577838897705, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0072, + "step": 11300 + }, + { + "epoch": 0.6922088255095171, + "grad_norm": 0.36888429522514343, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0059, + "step": 11310 + }, + { + "epoch": 0.6928208580696493, + "grad_norm": 0.26246029138565063, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0057, + "step": 11320 + }, + { + "epoch": 0.6934328906297815, + "grad_norm": 0.22163739800453186, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0078, + "step": 11330 + }, + { + "epoch": 0.6940449231899137, + "grad_norm": 0.33411458134651184, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0053, + "step": 11340 + }, + { + "epoch": 0.6946569557500459, + "grad_norm": 0.2792898118495941, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0095, + "step": 11350 + }, + { + "epoch": 0.6952689883101781, + "grad_norm": 0.2770175039768219, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0059, + "step": 11360 + }, + { + "epoch": 0.6958810208703103, + "grad_norm": 0.14913171529769897, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0046, + "step": 11370 + }, + { + "epoch": 0.6964930534304425, + "grad_norm": 0.22906239330768585, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0054, + "step": 11380 + }, + { + "epoch": 0.6971050859905747, + "grad_norm": 0.2854336202144623, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0052, + "step": 11390 + }, + { + "epoch": 0.697717118550707, + "grad_norm": 0.21835818886756897, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0064, + "step": 11400 + }, + { + "epoch": 0.698329151110839, + "grad_norm": 0.42180293798446655, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0071, + "step": 11410 + }, + { + "epoch": 0.6989411836709712, + "grad_norm": 0.3056841492652893, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0092, + "step": 11420 + }, + { + "epoch": 0.6995532162311034, + "grad_norm": 0.15149559080600739, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0049, + "step": 11430 + }, + { + "epoch": 0.7001652487912357, + "grad_norm": 0.15561188757419586, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0051, + "step": 11440 + }, + { + "epoch": 0.7007772813513679, + "grad_norm": 0.2941122055053711, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0065, + "step": 11450 + }, + { + "epoch": 0.7013893139115001, + "grad_norm": 0.3008195757865906, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0059, + "step": 11460 + }, + { + "epoch": 0.7020013464716323, + "grad_norm": 0.3787235617637634, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0068, + "step": 11470 + }, + { + "epoch": 0.7026133790317645, + "grad_norm": 0.2069675624370575, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.005, + "step": 11480 + }, + { + "epoch": 0.7032254115918967, + "grad_norm": 0.33505553007125854, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0058, + "step": 11490 + }, + { + "epoch": 0.7038374441520289, + "grad_norm": 0.281213641166687, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0064, + "step": 11500 + }, + { + "epoch": 0.7044494767121611, + "grad_norm": 0.28471192717552185, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0066, + "step": 11510 + }, + { + "epoch": 0.7050615092722933, + "grad_norm": 0.3166801929473877, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0062, + "step": 11520 + }, + { + "epoch": 0.7056735418324255, + "grad_norm": 0.26893407106399536, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.005, + "step": 11530 + }, + { + "epoch": 0.7062855743925577, + "grad_norm": 0.17421478033065796, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0063, + "step": 11540 + }, + { + "epoch": 0.7068976069526899, + "grad_norm": 0.40999990701675415, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 0.7075096395128221, + "grad_norm": 0.190180242061615, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0061, + "step": 11560 + }, + { + "epoch": 0.7081216720729543, + "grad_norm": 0.20383603870868683, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0049, + "step": 11570 + }, + { + "epoch": 0.7087337046330865, + "grad_norm": 0.28741395473480225, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.7093457371932187, + "grad_norm": 0.24231962859630585, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.008, + "step": 11590 + }, + { + "epoch": 0.7099577697533509, + "grad_norm": 0.2221115529537201, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0053, + "step": 11600 + }, + { + "epoch": 0.7105698023134831, + "grad_norm": 0.18564820289611816, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0072, + "step": 11610 + }, + { + "epoch": 0.7111818348736153, + "grad_norm": 0.3734343647956848, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0089, + "step": 11620 + }, + { + "epoch": 0.7117938674337475, + "grad_norm": 0.3215912878513336, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0093, + "step": 11630 + }, + { + "epoch": 0.7124058999938797, + "grad_norm": 0.22602899372577667, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0062, + "step": 11640 + }, + { + "epoch": 0.7130179325540119, + "grad_norm": 0.3115978538990021, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.006, + "step": 11650 + }, + { + "epoch": 0.7136299651141441, + "grad_norm": 0.26148155331611633, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0071, + "step": 11660 + }, + { + "epoch": 0.7142419976742763, + "grad_norm": 0.142781600356102, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0051, + "step": 11670 + }, + { + "epoch": 0.7148540302344085, + "grad_norm": 0.21306048333644867, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0068, + "step": 11680 + }, + { + "epoch": 0.7154660627945407, + "grad_norm": 0.3439876437187195, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7160780953546728, + "grad_norm": 0.4010280966758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0062, + "step": 11700 + }, + { + "epoch": 0.716690127914805, + "grad_norm": 0.2760031819343567, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 0.7173021604749372, + "grad_norm": 0.45097261667251587, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0062, + "step": 11720 + }, + { + "epoch": 0.7179141930350694, + "grad_norm": 0.20118115842342377, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0054, + "step": 11730 + }, + { + "epoch": 0.7185262255952016, + "grad_norm": 0.3090760409832001, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0054, + "step": 11740 + }, + { + "epoch": 0.7191382581553338, + "grad_norm": 0.25016647577285767, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0077, + "step": 11750 + }, + { + "epoch": 0.719750290715466, + "grad_norm": 0.2310703545808792, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0053, + "step": 11760 + }, + { + "epoch": 0.7203623232755982, + "grad_norm": 0.2269359678030014, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.006, + "step": 11770 + }, + { + "epoch": 0.7209743558357304, + "grad_norm": 0.3917788565158844, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7215863883958626, + "grad_norm": 0.25999465584754944, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0064, + "step": 11790 + }, + { + "epoch": 0.7221984209559948, + "grad_norm": 0.19340357184410095, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0071, + "step": 11800 + }, + { + "epoch": 0.722810453516127, + "grad_norm": 0.25046268105506897, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0078, + "step": 11810 + }, + { + "epoch": 0.7234224860762593, + "grad_norm": 0.19819264113903046, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.005, + "step": 11820 + }, + { + "epoch": 0.7240345186363915, + "grad_norm": 0.43484950065612793, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0078, + "step": 11830 + }, + { + "epoch": 0.7246465511965237, + "grad_norm": 0.29191601276397705, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7252585837566559, + "grad_norm": 0.21717441082000732, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0056, + "step": 11850 + }, + { + "epoch": 0.7258706163167881, + "grad_norm": 0.3210129737854004, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0072, + "step": 11860 + }, + { + "epoch": 0.7264826488769203, + "grad_norm": 0.33192649483680725, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0061, + "step": 11870 + }, + { + "epoch": 0.7270946814370525, + "grad_norm": 0.14648163318634033, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7277067139971847, + "grad_norm": 0.20028764009475708, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0052, + "step": 11890 + }, + { + "epoch": 0.7283187465573169, + "grad_norm": 0.21449612081050873, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0063, + "step": 11900 + }, + { + "epoch": 0.7289307791174491, + "grad_norm": 0.27472081780433655, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0062, + "step": 11910 + }, + { + "epoch": 0.7295428116775813, + "grad_norm": 0.2919130027294159, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0048, + "step": 11920 + }, + { + "epoch": 0.7301548442377135, + "grad_norm": 0.153092160820961, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0053, + "step": 11930 + }, + { + "epoch": 0.7307668767978457, + "grad_norm": 0.22820086777210236, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0058, + "step": 11940 + }, + { + "epoch": 0.7313789093579779, + "grad_norm": 0.24281881749629974, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0044, + "step": 11950 + }, + { + "epoch": 0.7319909419181101, + "grad_norm": 0.32581812143325806, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0051, + "step": 11960 + }, + { + "epoch": 0.7326029744782423, + "grad_norm": 0.3139822483062744, + "learning_rate": 1.435930222050582e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 0.7332150070383744, + "grad_norm": 0.37985655665397644, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0052, + "step": 11980 + }, + { + "epoch": 0.7338270395985066, + "grad_norm": 0.1958508938550949, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.007, + "step": 11990 + }, + { + "epoch": 0.7344390721586388, + "grad_norm": 0.25318172574043274, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0084, + "step": 12000 + }, + { + "epoch": 0.735051104718771, + "grad_norm": 0.33245304226875305, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0051, + "step": 12010 + }, + { + "epoch": 0.7356631372789032, + "grad_norm": 0.2750372290611267, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0057, + "step": 12020 + }, + { + "epoch": 0.7362751698390354, + "grad_norm": 0.2057010382413864, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0057, + "step": 12030 + }, + { + "epoch": 0.7368872023991676, + "grad_norm": 0.30713731050491333, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0067, + "step": 12040 + }, + { + "epoch": 0.7374992349592998, + "grad_norm": 0.20423808693885803, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.006, + "step": 12050 + }, + { + "epoch": 0.738111267519432, + "grad_norm": 0.3129539489746094, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0067, + "step": 12060 + }, + { + "epoch": 0.7387233000795642, + "grad_norm": 0.25026270747184753, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7393353326396964, + "grad_norm": 0.4147534668445587, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0061, + "step": 12080 + }, + { + "epoch": 0.7399473651998286, + "grad_norm": 0.20954278111457825, + "learning_rate": 1.425047976058418e-05, + "loss": 0.006, + "step": 12090 + }, + { + "epoch": 0.7405593977599608, + "grad_norm": 0.2700798809528351, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 0.741171430320093, + "grad_norm": 0.2597086429595947, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0059, + "step": 12110 + }, + { + "epoch": 0.7417834628802252, + "grad_norm": 0.2674495279788971, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0045, + "step": 12120 + }, + { + "epoch": 0.7423954954403574, + "grad_norm": 0.24583879113197327, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0061, + "step": 12130 + }, + { + "epoch": 0.7430075280004896, + "grad_norm": 0.23704801499843597, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0054, + "step": 12140 + }, + { + "epoch": 0.7436195605606218, + "grad_norm": 0.2381024807691574, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0073, + "step": 12150 + }, + { + "epoch": 0.744231593120754, + "grad_norm": 0.24937355518341064, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0049, + "step": 12160 + }, + { + "epoch": 0.7448436256808862, + "grad_norm": 0.20442882180213928, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0061, + "step": 12170 + }, + { + "epoch": 0.7454556582410184, + "grad_norm": 0.3053426742553711, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0087, + "step": 12180 + }, + { + "epoch": 0.7460676908011507, + "grad_norm": 0.3654315769672394, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0047, + "step": 12190 + }, + { + "epoch": 0.7466797233612829, + "grad_norm": 0.18926535546779633, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7472917559214151, + "grad_norm": 0.21620485186576843, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0094, + "step": 12210 + }, + { + "epoch": 0.7479037884815473, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0059, + "step": 12220 + }, + { + "epoch": 0.7485158210416795, + "grad_norm": 0.39795419573783875, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 0.7491278536018117, + "grad_norm": 0.20502857863903046, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0048, + "step": 12240 + }, + { + "epoch": 0.7497398861619439, + "grad_norm": 0.23821429908275604, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0082, + "step": 12250 + }, + { + "epoch": 0.750351918722076, + "grad_norm": 0.45541366934776306, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0071, + "step": 12260 + }, + { + "epoch": 0.7509639512822082, + "grad_norm": 0.24881400167942047, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.7515759838423404, + "grad_norm": 0.2409125715494156, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0061, + "step": 12280 + }, + { + "epoch": 0.7521880164024726, + "grad_norm": 0.2930417060852051, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0054, + "step": 12290 + }, + { + "epoch": 0.7528000489626048, + "grad_norm": 0.30566394329071045, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0054, + "step": 12300 + }, + { + "epoch": 0.753412081522737, + "grad_norm": 0.32679763436317444, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0059, + "step": 12310 + }, + { + "epoch": 0.7540241140828692, + "grad_norm": 0.29273876547813416, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0067, + "step": 12320 + }, + { + "epoch": 0.7546361466430014, + "grad_norm": 0.19642773270606995, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0062, + "step": 12330 + }, + { + "epoch": 0.7552481792031336, + "grad_norm": 0.21928250789642334, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0066, + "step": 12340 + }, + { + "epoch": 0.7558602117632658, + "grad_norm": 0.2534322738647461, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.756472244323398, + "grad_norm": 0.20712649822235107, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 0.7570842768835302, + "grad_norm": 0.18670639395713806, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0063, + "step": 12370 + }, + { + "epoch": 0.7576963094436624, + "grad_norm": 0.26770254969596863, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0052, + "step": 12380 + }, + { + "epoch": 0.7583083420037946, + "grad_norm": 0.3621291518211365, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0056, + "step": 12390 + }, + { + "epoch": 0.7589203745639268, + "grad_norm": 0.31771939992904663, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0059, + "step": 12400 + }, + { + "epoch": 0.759532407124059, + "grad_norm": 0.44418177008628845, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0065, + "step": 12410 + }, + { + "epoch": 0.7601444396841912, + "grad_norm": 0.2183474898338318, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0046, + "step": 12420 + }, + { + "epoch": 0.7607564722443234, + "grad_norm": 0.4400590658187866, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0061, + "step": 12430 + }, + { + "epoch": 0.7613685048044556, + "grad_norm": 0.296539843082428, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0059, + "step": 12440 + }, + { + "epoch": 0.7619805373645878, + "grad_norm": 0.352870374917984, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0055, + "step": 12450 + }, + { + "epoch": 0.76259256992472, + "grad_norm": 0.19494596123695374, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.7632046024848522, + "grad_norm": 0.3799489438533783, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0057, + "step": 12470 + }, + { + "epoch": 0.7638166350449844, + "grad_norm": 0.3572365641593933, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0053, + "step": 12480 + }, + { + "epoch": 0.7644286676051166, + "grad_norm": 0.2559097707271576, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0062, + "step": 12490 + }, + { + "epoch": 0.7650407001652488, + "grad_norm": 0.13144978880882263, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0065, + "step": 12500 + }, + { + "epoch": 0.765652732725381, + "grad_norm": 0.34635287523269653, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7662647652855132, + "grad_norm": 0.25615188479423523, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0057, + "step": 12520 + }, + { + "epoch": 0.7668767978456454, + "grad_norm": 0.17619644105434418, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0047, + "step": 12530 + }, + { + "epoch": 0.7674888304057775, + "grad_norm": 0.20169994235038757, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0068, + "step": 12540 + }, + { + "epoch": 0.7681008629659097, + "grad_norm": 0.49686071276664734, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0066, + "step": 12550 + }, + { + "epoch": 0.7687128955260419, + "grad_norm": 0.28179335594177246, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0058, + "step": 12560 + }, + { + "epoch": 0.7693249280861741, + "grad_norm": 0.28156182169914246, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.005, + "step": 12570 + }, + { + "epoch": 0.7699369606463063, + "grad_norm": 0.15054315328598022, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0051, + "step": 12580 + }, + { + "epoch": 0.7705489932064385, + "grad_norm": 0.22872644662857056, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0066, + "step": 12590 + }, + { + "epoch": 0.7711610257665708, + "grad_norm": 0.25821951031684875, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0054, + "step": 12600 + }, + { + "epoch": 0.771773058326703, + "grad_norm": 0.23592771589756012, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0059, + "step": 12610 + }, + { + "epoch": 0.7723850908868352, + "grad_norm": 0.34409141540527344, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0053, + "step": 12620 + }, + { + "epoch": 0.7729971234469674, + "grad_norm": 0.2803158760070801, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0042, + "step": 12630 + }, + { + "epoch": 0.7736091560070996, + "grad_norm": 0.32796284556388855, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0074, + "step": 12640 + }, + { + "epoch": 0.7742211885672318, + "grad_norm": 0.34749120473861694, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0054, + "step": 12650 + }, + { + "epoch": 0.774833221127364, + "grad_norm": 0.34066343307495117, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0082, + "step": 12660 + }, + { + "epoch": 0.7754452536874962, + "grad_norm": 0.4294384717941284, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0062, + "step": 12670 + }, + { + "epoch": 0.7760572862476284, + "grad_norm": 0.2355230748653412, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0069, + "step": 12680 + }, + { + "epoch": 0.7766693188077606, + "grad_norm": 0.3181976079940796, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 0.7772813513678928, + "grad_norm": 0.2763727605342865, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0052, + "step": 12700 + }, + { + "epoch": 0.777893383928025, + "grad_norm": 0.2938949465751648, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0041, + "step": 12710 + }, + { + "epoch": 0.7785054164881572, + "grad_norm": 0.31331220269203186, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0062, + "step": 12720 + }, + { + "epoch": 0.7791174490482894, + "grad_norm": 0.3389904797077179, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0061, + "step": 12730 + }, + { + "epoch": 0.7797294816084216, + "grad_norm": 0.2848975360393524, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0065, + "step": 12740 + }, + { + "epoch": 0.7803415141685538, + "grad_norm": 0.29838478565216064, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0061, + "step": 12750 + }, + { + "epoch": 0.780953546728686, + "grad_norm": 0.47004032135009766, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0066, + "step": 12760 + }, + { + "epoch": 0.7815655792888182, + "grad_norm": 0.26898056268692017, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0063, + "step": 12770 + }, + { + "epoch": 0.7821776118489504, + "grad_norm": 0.29459917545318604, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0065, + "step": 12780 + }, + { + "epoch": 0.7827896444090826, + "grad_norm": 0.3481508791446686, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0058, + "step": 12790 + }, + { + "epoch": 0.7834016769692148, + "grad_norm": 0.1707627922296524, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0053, + "step": 12800 + }, + { + "epoch": 0.784013709529347, + "grad_norm": 0.14735333621501923, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0058, + "step": 12810 + }, + { + "epoch": 0.7846257420894791, + "grad_norm": 0.28002044558525085, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.006, + "step": 12820 + }, + { + "epoch": 0.7852377746496113, + "grad_norm": 0.39598894119262695, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0062, + "step": 12830 + }, + { + "epoch": 0.7858498072097435, + "grad_norm": 0.19379247725009918, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0058, + "step": 12840 + }, + { + "epoch": 0.7864618397698757, + "grad_norm": 0.27260729670524597, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7870738723300079, + "grad_norm": 0.2845087945461273, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0052, + "step": 12860 + }, + { + "epoch": 0.7876859048901401, + "grad_norm": 0.37151217460632324, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0043, + "step": 12870 + }, + { + "epoch": 0.7882979374502723, + "grad_norm": 0.3387412130832672, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0046, + "step": 12880 + }, + { + "epoch": 0.7889099700104045, + "grad_norm": 0.42672809958457947, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7895220025705367, + "grad_norm": 0.20378202199935913, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.7901340351306689, + "grad_norm": 0.16417330503463745, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0045, + "step": 12910 + }, + { + "epoch": 0.7907460676908011, + "grad_norm": 0.1704142540693283, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0054, + "step": 12920 + }, + { + "epoch": 0.7913581002509333, + "grad_norm": 0.21494890749454498, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.7919701328110655, + "grad_norm": 0.3430638909339905, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0046, + "step": 12940 + }, + { + "epoch": 0.7925821653711977, + "grad_norm": 0.22641201317310333, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0049, + "step": 12950 + }, + { + "epoch": 0.79319419793133, + "grad_norm": 0.27153971791267395, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 0.7938062304914622, + "grad_norm": 0.2648560702800751, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 0.7944182630515944, + "grad_norm": 0.2148633897304535, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0048, + "step": 12980 + }, + { + "epoch": 0.7950302956117266, + "grad_norm": 0.35170191526412964, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0069, + "step": 12990 + }, + { + "epoch": 0.7956423281718588, + "grad_norm": 0.3539712429046631, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0067, + "step": 13000 + }, + { + "epoch": 0.796254360731991, + "grad_norm": 0.29938259720802307, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0102, + "step": 13010 + }, + { + "epoch": 0.7968663932921232, + "grad_norm": 0.35241010785102844, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7974784258522554, + "grad_norm": 0.2929113805294037, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0061, + "step": 13030 + }, + { + "epoch": 0.7980904584123876, + "grad_norm": 0.24052929878234863, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0046, + "step": 13040 + }, + { + "epoch": 0.7987024909725198, + "grad_norm": 0.21611042320728302, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0043, + "step": 13050 + }, + { + "epoch": 0.799314523532652, + "grad_norm": 0.23498570919036865, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.7999265560927842, + "grad_norm": 0.30229923129081726, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0068, + "step": 13070 + }, + { + "epoch": 0.8005385886529164, + "grad_norm": 0.2916681170463562, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0062, + "step": 13080 + }, + { + "epoch": 0.8011506212130486, + "grad_norm": 0.31905195116996765, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.8017626537731807, + "grad_norm": 0.22307109832763672, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 0.8023746863333129, + "grad_norm": 0.2815198004245758, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0054, + "step": 13110 + }, + { + "epoch": 0.8029867188934451, + "grad_norm": 0.18762829899787903, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0064, + "step": 13120 + }, + { + "epoch": 0.8035987514535773, + "grad_norm": 0.1918255090713501, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0064, + "step": 13130 + }, + { + "epoch": 0.8042107840137095, + "grad_norm": 0.3726229667663574, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0065, + "step": 13140 + }, + { + "epoch": 0.8048228165738417, + "grad_norm": 0.423285573720932, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0062, + "step": 13150 + }, + { + "epoch": 0.8054348491339739, + "grad_norm": 0.1709958165884018, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0052, + "step": 13160 + }, + { + "epoch": 0.8060468816941061, + "grad_norm": 0.3615981936454773, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 0.8066589142542383, + "grad_norm": 0.2101999819278717, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0053, + "step": 13180 + }, + { + "epoch": 0.8072709468143705, + "grad_norm": 0.14393582940101624, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.8078829793745027, + "grad_norm": 0.3704521656036377, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0056, + "step": 13200 + }, + { + "epoch": 0.8084950119346349, + "grad_norm": 0.23275913298130035, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0051, + "step": 13210 + }, + { + "epoch": 0.8091070444947671, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0045, + "step": 13220 + }, + { + "epoch": 0.8097190770548993, + "grad_norm": 0.21721667051315308, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.8103311096150315, + "grad_norm": 0.29456019401550293, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 0.8109431421751637, + "grad_norm": 0.19854630529880524, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0071, + "step": 13250 + }, + { + "epoch": 0.8115551747352959, + "grad_norm": 0.4318163990974426, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0059, + "step": 13260 + }, + { + "epoch": 0.8121672072954281, + "grad_norm": 0.3421531915664673, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.006, + "step": 13270 + }, + { + "epoch": 0.8127792398555603, + "grad_norm": 0.2370125651359558, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0054, + "step": 13280 + }, + { + "epoch": 0.8133912724156925, + "grad_norm": 0.2996460497379303, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 0.8140033049758247, + "grad_norm": 0.2911904454231262, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0053, + "step": 13300 + }, + { + "epoch": 0.8146153375359569, + "grad_norm": 0.26010408997535706, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0053, + "step": 13310 + }, + { + "epoch": 0.8152273700960891, + "grad_norm": 0.404702752828598, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0081, + "step": 13320 + }, + { + "epoch": 0.8158394026562213, + "grad_norm": 0.25591781735420227, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0057, + "step": 13330 + }, + { + "epoch": 0.8164514352163535, + "grad_norm": 0.1437849998474121, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0064, + "step": 13340 + }, + { + "epoch": 0.8170634677764858, + "grad_norm": 0.12252022325992584, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0047, + "step": 13350 + }, + { + "epoch": 0.817675500336618, + "grad_norm": 0.1861230581998825, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0063, + "step": 13360 + }, + { + "epoch": 0.8182875328967502, + "grad_norm": 0.2313026636838913, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0066, + "step": 13370 + }, + { + "epoch": 0.8188995654568824, + "grad_norm": 0.5445839166641235, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0076, + "step": 13380 + }, + { + "epoch": 0.8195115980170145, + "grad_norm": 0.21818871796131134, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0068, + "step": 13390 + }, + { + "epoch": 0.8201236305771467, + "grad_norm": 0.21823963522911072, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0072, + "step": 13400 + }, + { + "epoch": 0.8207356631372789, + "grad_norm": 0.1730659157037735, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0051, + "step": 13410 + }, + { + "epoch": 0.8213476956974111, + "grad_norm": 0.1301007866859436, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0075, + "step": 13420 + }, + { + "epoch": 0.8219597282575433, + "grad_norm": 0.32452520728111267, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.005, + "step": 13430 + }, + { + "epoch": 0.8225717608176755, + "grad_norm": 0.24771001935005188, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0058, + "step": 13440 + }, + { + "epoch": 0.8231837933778077, + "grad_norm": 0.4575227200984955, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0062, + "step": 13450 + }, + { + "epoch": 0.8237958259379399, + "grad_norm": 0.16441279649734497, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0081, + "step": 13460 + }, + { + "epoch": 0.8244078584980721, + "grad_norm": 0.26582902669906616, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0069, + "step": 13470 + }, + { + "epoch": 0.8250198910582043, + "grad_norm": 0.18871302902698517, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0068, + "step": 13480 + }, + { + "epoch": 0.8256319236183365, + "grad_norm": 0.23244783282279968, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0063, + "step": 13490 + }, + { + "epoch": 0.8262439561784687, + "grad_norm": 0.2399880290031433, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0036, + "step": 13500 + }, + { + "epoch": 0.8268559887386009, + "grad_norm": 0.25766822695732117, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0074, + "step": 13510 + }, + { + "epoch": 0.8274680212987331, + "grad_norm": 0.24792100489139557, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0066, + "step": 13520 + }, + { + "epoch": 0.8280800538588653, + "grad_norm": 0.3371896743774414, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8286920864189975, + "grad_norm": 0.16249819099903107, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0079, + "step": 13540 + }, + { + "epoch": 0.8293041189791297, + "grad_norm": 0.2705139219760895, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0069, + "step": 13550 + }, + { + "epoch": 0.8299161515392619, + "grad_norm": 0.1905352771282196, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0055, + "step": 13560 + }, + { + "epoch": 0.8305281840993941, + "grad_norm": 0.23938500881195068, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0055, + "step": 13570 + }, + { + "epoch": 0.8311402166595263, + "grad_norm": 0.3562251031398773, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0054, + "step": 13580 + }, + { + "epoch": 0.8317522492196585, + "grad_norm": 0.2934769093990326, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0064, + "step": 13590 + }, + { + "epoch": 0.8323642817797907, + "grad_norm": 0.252366840839386, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0078, + "step": 13600 + }, + { + "epoch": 0.8329763143399229, + "grad_norm": 0.16646964848041534, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0046, + "step": 13610 + }, + { + "epoch": 0.8335883469000551, + "grad_norm": 0.22584658861160278, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8342003794601873, + "grad_norm": 0.3578774034976959, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0049, + "step": 13630 + }, + { + "epoch": 0.8348124120203195, + "grad_norm": 0.3447739779949188, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0065, + "step": 13640 + }, + { + "epoch": 0.8354244445804517, + "grad_norm": 0.381954550743103, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0057, + "step": 13650 + }, + { + "epoch": 0.8360364771405839, + "grad_norm": 0.3563731908798218, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0065, + "step": 13660 + }, + { + "epoch": 0.836648509700716, + "grad_norm": 0.29516372084617615, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0053, + "step": 13670 + }, + { + "epoch": 0.8372605422608482, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0043, + "step": 13680 + }, + { + "epoch": 0.8378725748209804, + "grad_norm": 0.4608387351036072, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.005, + "step": 13690 + }, + { + "epoch": 0.8384846073811126, + "grad_norm": 0.31025534868240356, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0055, + "step": 13700 + }, + { + "epoch": 0.8390966399412448, + "grad_norm": 0.32904690504074097, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.839708672501377, + "grad_norm": 0.2547053098678589, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0061, + "step": 13720 + }, + { + "epoch": 0.8403207050615092, + "grad_norm": 0.30524104833602905, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.005, + "step": 13730 + }, + { + "epoch": 0.8409327376216414, + "grad_norm": 0.17741642892360687, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "epoch": 0.8415447701817736, + "grad_norm": 0.23125578463077545, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0053, + "step": 13750 + }, + { + "epoch": 0.8421568027419059, + "grad_norm": 0.3080023229122162, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0075, + "step": 13760 + }, + { + "epoch": 0.842768835302038, + "grad_norm": 0.2509821951389313, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0053, + "step": 13770 + }, + { + "epoch": 0.8433808678621703, + "grad_norm": 0.17483864724636078, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.005, + "step": 13780 + }, + { + "epoch": 0.8439929004223025, + "grad_norm": 0.3952518403530121, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0056, + "step": 13790 + }, + { + "epoch": 0.8446049329824347, + "grad_norm": 0.2945535480976105, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0055, + "step": 13800 + }, + { + "epoch": 0.8452169655425669, + "grad_norm": 0.13024291396141052, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0073, + "step": 13810 + }, + { + "epoch": 0.8458289981026991, + "grad_norm": 0.1840520054101944, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0061, + "step": 13820 + }, + { + "epoch": 0.8464410306628313, + "grad_norm": 0.2368786782026291, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0058, + "step": 13830 + }, + { + "epoch": 0.8470530632229635, + "grad_norm": 0.2885456085205078, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0055, + "step": 13840 + }, + { + "epoch": 0.8476650957830957, + "grad_norm": 0.2782488167285919, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0046, + "step": 13850 + }, + { + "epoch": 0.8482771283432279, + "grad_norm": 0.1711442470550537, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0055, + "step": 13860 + }, + { + "epoch": 0.8488891609033601, + "grad_norm": 0.22235877811908722, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0056, + "step": 13870 + }, + { + "epoch": 0.8495011934634923, + "grad_norm": 0.1937183290719986, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0047, + "step": 13880 + }, + { + "epoch": 0.8501132260236245, + "grad_norm": 0.33960190415382385, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0063, + "step": 13890 + }, + { + "epoch": 0.8507252585837567, + "grad_norm": 0.1983388215303421, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8513372911438889, + "grad_norm": 0.2968246638774872, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0051, + "step": 13910 + }, + { + "epoch": 0.8519493237040211, + "grad_norm": 0.25328314304351807, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0057, + "step": 13920 + }, + { + "epoch": 0.8525613562641533, + "grad_norm": 0.2435184270143509, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.8531733888242855, + "grad_norm": 0.24512560665607452, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0053, + "step": 13940 + }, + { + "epoch": 0.8537854213844176, + "grad_norm": 0.22028976678848267, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.006, + "step": 13950 + }, + { + "epoch": 0.8543974539445498, + "grad_norm": 0.24743935465812683, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0065, + "step": 13960 + }, + { + "epoch": 0.855009486504682, + "grad_norm": 0.1393810361623764, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0081, + "step": 13970 + }, + { + "epoch": 0.8556215190648142, + "grad_norm": 0.25975972414016724, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0063, + "step": 13980 + }, + { + "epoch": 0.8562335516249464, + "grad_norm": 0.1944616585969925, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0049, + "step": 13990 + }, + { + "epoch": 0.8568455841850786, + "grad_norm": 0.21936742961406708, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.8574576167452108, + "grad_norm": 0.1556629091501236, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 0.858069649305343, + "grad_norm": 0.23696991801261902, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 0.8586816818654752, + "grad_norm": 0.32507795095443726, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0061, + "step": 14030 + }, + { + "epoch": 0.8592937144256074, + "grad_norm": 0.35332199931144714, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0058, + "step": 14040 + }, + { + "epoch": 0.8599057469857396, + "grad_norm": 0.1835644394159317, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0046, + "step": 14050 + }, + { + "epoch": 0.8605177795458718, + "grad_norm": 0.19127517938613892, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0044, + "step": 14060 + }, + { + "epoch": 0.861129812106004, + "grad_norm": 0.30748996138572693, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0055, + "step": 14070 + }, + { + "epoch": 0.8617418446661362, + "grad_norm": 0.178785502910614, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0049, + "step": 14080 + }, + { + "epoch": 0.8623538772262684, + "grad_norm": 0.16979056596755981, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0044, + "step": 14090 + }, + { + "epoch": 0.8629659097864006, + "grad_norm": 0.19519983232021332, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0063, + "step": 14100 + }, + { + "epoch": 0.8635779423465328, + "grad_norm": 0.2722550928592682, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 0.864189974906665, + "grad_norm": 0.1956222504377365, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0054, + "step": 14120 + }, + { + "epoch": 0.8648020074667973, + "grad_norm": 0.32274308800697327, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0066, + "step": 14130 + }, + { + "epoch": 0.8654140400269295, + "grad_norm": 0.25953641533851624, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0056, + "step": 14140 + }, + { + "epoch": 0.8660260725870617, + "grad_norm": 0.3293299674987793, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0057, + "step": 14150 + }, + { + "epoch": 0.8666381051471939, + "grad_norm": 0.35404127836227417, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0072, + "step": 14160 + }, + { + "epoch": 0.8672501377073261, + "grad_norm": 0.24674376845359802, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0064, + "step": 14170 + }, + { + "epoch": 0.8678621702674583, + "grad_norm": 0.23506462574005127, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0047, + "step": 14180 + }, + { + "epoch": 0.8684742028275905, + "grad_norm": 0.30500903725624084, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.8690862353877227, + "grad_norm": 0.23000167310237885, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0051, + "step": 14200 + }, + { + "epoch": 0.8696982679478549, + "grad_norm": 0.17339368164539337, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 0.8703103005079871, + "grad_norm": 0.2505367696285248, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0059, + "step": 14220 + }, + { + "epoch": 0.8709223330681192, + "grad_norm": 0.22645734250545502, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0044, + "step": 14230 + }, + { + "epoch": 0.8715343656282514, + "grad_norm": 0.3509127199649811, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0043, + "step": 14240 + }, + { + "epoch": 0.8721463981883836, + "grad_norm": 0.2758972644805908, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0055, + "step": 14250 + }, + { + "epoch": 0.8727584307485158, + "grad_norm": 0.1943834275007248, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.006, + "step": 14260 + }, + { + "epoch": 0.873370463308648, + "grad_norm": 0.32881075143814087, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 0.8739824958687802, + "grad_norm": 0.35203438997268677, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8745945284289124, + "grad_norm": 0.13618917763233185, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0044, + "step": 14290 + }, + { + "epoch": 0.8752065609890446, + "grad_norm": 0.22939404845237732, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0044, + "step": 14300 + }, + { + "epoch": 0.8758185935491768, + "grad_norm": 0.2027491182088852, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0051, + "step": 14310 + }, + { + "epoch": 0.876430626109309, + "grad_norm": 0.21950028836727142, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0105, + "step": 14320 + }, + { + "epoch": 0.8770426586694412, + "grad_norm": 0.307913213968277, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0051, + "step": 14330 + }, + { + "epoch": 0.8776546912295734, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0058, + "step": 14340 + }, + { + "epoch": 0.8782667237897056, + "grad_norm": 0.3033636808395386, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0066, + "step": 14350 + }, + { + "epoch": 0.8788787563498378, + "grad_norm": 0.25514236092567444, + "learning_rate": 1.210961823379053e-05, + "loss": 0.005, + "step": 14360 + }, + { + "epoch": 0.87949078890997, + "grad_norm": 0.2574418783187866, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0069, + "step": 14370 + }, + { + "epoch": 0.8801028214701022, + "grad_norm": 0.17803016304969788, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.006, + "step": 14380 + }, + { + "epoch": 0.8807148540302344, + "grad_norm": 0.31375741958618164, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0077, + "step": 14390 + }, + { + "epoch": 0.8813268865903666, + "grad_norm": 0.18031778931617737, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0052, + "step": 14400 + }, + { + "epoch": 0.8819389191504988, + "grad_norm": 0.18077519536018372, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0055, + "step": 14410 + }, + { + "epoch": 0.882550951710631, + "grad_norm": 0.22171644866466522, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0059, + "step": 14420 + }, + { + "epoch": 0.8831629842707632, + "grad_norm": 0.16187389194965363, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0044, + "step": 14430 + }, + { + "epoch": 0.8837750168308954, + "grad_norm": 0.27667325735092163, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0063, + "step": 14440 + }, + { + "epoch": 0.8843870493910276, + "grad_norm": 0.2493051290512085, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0072, + "step": 14450 + }, + { + "epoch": 0.8849990819511598, + "grad_norm": 0.3519611656665802, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 0.885611114511292, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0057, + "step": 14470 + }, + { + "epoch": 0.8862231470714242, + "grad_norm": 0.24518658220767975, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.8868351796315564, + "grad_norm": 0.28493785858154297, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0055, + "step": 14490 + }, + { + "epoch": 0.8874472121916887, + "grad_norm": 0.22260263562202454, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 0.8880592447518207, + "grad_norm": 0.2804561257362366, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0051, + "step": 14510 + }, + { + "epoch": 0.888671277311953, + "grad_norm": 0.24349385499954224, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0045, + "step": 14520 + }, + { + "epoch": 0.8892833098720851, + "grad_norm": 0.262207955121994, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0082, + "step": 14530 + }, + { + "epoch": 0.8898953424322174, + "grad_norm": 0.15527820587158203, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0061, + "step": 14540 + }, + { + "epoch": 0.8905073749923496, + "grad_norm": 0.23850804567337036, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0042, + "step": 14550 + }, + { + "epoch": 0.8911194075524818, + "grad_norm": 0.2665582001209259, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0053, + "step": 14560 + }, + { + "epoch": 0.891731440112614, + "grad_norm": 0.2652167081832886, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 0.8923434726727462, + "grad_norm": 0.21386243402957916, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "epoch": 0.8929555052328784, + "grad_norm": 0.3087247312068939, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0082, + "step": 14590 + }, + { + "epoch": 0.8935675377930106, + "grad_norm": 0.2003909796476364, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0048, + "step": 14600 + }, + { + "epoch": 0.8941795703531428, + "grad_norm": 0.2214624583721161, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0062, + "step": 14610 + }, + { + "epoch": 0.894791602913275, + "grad_norm": 0.2500647306442261, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0052, + "step": 14620 + }, + { + "epoch": 0.8954036354734072, + "grad_norm": 0.2615419030189514, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0054, + "step": 14630 + }, + { + "epoch": 0.8960156680335394, + "grad_norm": 0.21347551047801971, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0048, + "step": 14640 + }, + { + "epoch": 0.8966277005936716, + "grad_norm": 0.35483887791633606, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0054, + "step": 14650 + }, + { + "epoch": 0.8972397331538038, + "grad_norm": 0.2423439472913742, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0055, + "step": 14660 + }, + { + "epoch": 0.897851765713936, + "grad_norm": 0.16826359927654266, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 0.8984637982740682, + "grad_norm": 0.3589499294757843, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "epoch": 0.8990758308342004, + "grad_norm": 0.3081042468547821, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 0.8996878633943326, + "grad_norm": 0.31996914744377136, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0063, + "step": 14700 + }, + { + "epoch": 0.9002998959544648, + "grad_norm": 0.301209419965744, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0073, + "step": 14710 + }, + { + "epoch": 0.900911928514597, + "grad_norm": 0.19257168471813202, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0055, + "step": 14720 + }, + { + "epoch": 0.9015239610747292, + "grad_norm": 0.15221600234508514, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0053, + "step": 14730 + }, + { + "epoch": 0.9021359936348614, + "grad_norm": 0.21519577503204346, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0055, + "step": 14740 + }, + { + "epoch": 0.9027480261949936, + "grad_norm": 0.23772196471691132, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.006, + "step": 14750 + }, + { + "epoch": 0.9033600587551258, + "grad_norm": 0.2872219979763031, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.903972091315258, + "grad_norm": 0.2589483857154846, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0056, + "step": 14770 + }, + { + "epoch": 0.9045841238753902, + "grad_norm": 0.31850162148475647, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0051, + "step": 14780 + }, + { + "epoch": 0.9051961564355223, + "grad_norm": 0.27179282903671265, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0051, + "step": 14790 + }, + { + "epoch": 0.9058081889956545, + "grad_norm": 0.4132739007472992, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.005, + "step": 14800 + }, + { + "epoch": 0.9064202215557867, + "grad_norm": 0.19336774945259094, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0041, + "step": 14810 + }, + { + "epoch": 0.9070322541159189, + "grad_norm": 0.20783282816410065, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.9076442866760511, + "grad_norm": 0.26141899824142456, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0069, + "step": 14830 + }, + { + "epoch": 0.9082563192361833, + "grad_norm": 0.2158539742231369, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0081, + "step": 14840 + }, + { + "epoch": 0.9088683517963155, + "grad_norm": 0.3233732581138611, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0065, + "step": 14850 + }, + { + "epoch": 0.9094803843564477, + "grad_norm": 0.23924769461154938, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0042, + "step": 14860 + }, + { + "epoch": 0.9100924169165799, + "grad_norm": 0.17663812637329102, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.004, + "step": 14870 + }, + { + "epoch": 0.9107044494767121, + "grad_norm": 0.34379643201828003, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.005, + "step": 14880 + }, + { + "epoch": 0.9113164820368443, + "grad_norm": 0.29971349239349365, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0059, + "step": 14890 + }, + { + "epoch": 0.9119285145969765, + "grad_norm": 0.24832949042320251, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0042, + "step": 14900 + }, + { + "epoch": 0.9125405471571088, + "grad_norm": 0.22288024425506592, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0065, + "step": 14910 + }, + { + "epoch": 0.913152579717241, + "grad_norm": 0.2806689441204071, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0043, + "step": 14920 + }, + { + "epoch": 0.9137646122773732, + "grad_norm": 0.3908274173736572, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0047, + "step": 14930 + }, + { + "epoch": 0.9143766448375054, + "grad_norm": 0.16255778074264526, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0068, + "step": 14940 + }, + { + "epoch": 0.9149886773976376, + "grad_norm": 0.430791437625885, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0085, + "step": 14950 + }, + { + "epoch": 0.9156007099577698, + "grad_norm": 0.1739969551563263, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0056, + "step": 14960 + }, + { + "epoch": 0.916212742517902, + "grad_norm": 0.24298283457756042, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0055, + "step": 14970 + }, + { + "epoch": 0.9168247750780342, + "grad_norm": 0.21269915997982025, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0051, + "step": 14980 + }, + { + "epoch": 0.9174368076381664, + "grad_norm": 0.263388991355896, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0058, + "step": 14990 + }, + { + "epoch": 0.9180488401982986, + "grad_norm": 0.28030532598495483, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 0.9186608727584308, + "grad_norm": 0.17051894962787628, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 0.919272905318563, + "grad_norm": 0.2763383388519287, + "learning_rate": 1.146875176249365e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 0.9198849378786952, + "grad_norm": 0.2616822421550751, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "epoch": 0.9204969704388274, + "grad_norm": 0.21407093107700348, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0062, + "step": 15040 + }, + { + "epoch": 0.9211090029989596, + "grad_norm": 0.23936578631401062, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0073, + "step": 15050 + }, + { + "epoch": 0.9217210355590918, + "grad_norm": 0.26383110880851746, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.006, + "step": 15060 + }, + { + "epoch": 0.922333068119224, + "grad_norm": 0.19477945566177368, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0043, + "step": 15070 + }, + { + "epoch": 0.9229451006793561, + "grad_norm": 0.16677282750606537, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0061, + "step": 15080 + }, + { + "epoch": 0.9235571332394883, + "grad_norm": 0.26856037974357605, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0065, + "step": 15090 + }, + { + "epoch": 0.9241691657996205, + "grad_norm": 0.20086173713207245, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0056, + "step": 15100 + }, + { + "epoch": 0.9247811983597527, + "grad_norm": 0.26998719573020935, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0034, + "step": 15110 + }, + { + "epoch": 0.9253932309198849, + "grad_norm": 0.12727728486061096, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0043, + "step": 15120 + }, + { + "epoch": 0.9260052634800171, + "grad_norm": 0.11288347095251083, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0055, + "step": 15130 + }, + { + "epoch": 0.9266172960401493, + "grad_norm": 0.1109771579504013, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0048, + "step": 15140 + }, + { + "epoch": 0.9272293286002815, + "grad_norm": 0.2556479275226593, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0056, + "step": 15150 + }, + { + "epoch": 0.9278413611604137, + "grad_norm": 0.2149561196565628, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.005, + "step": 15160 + }, + { + "epoch": 0.9284533937205459, + "grad_norm": 0.16953054070472717, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 0.9290654262806781, + "grad_norm": 0.18306049704551697, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.004, + "step": 15180 + }, + { + "epoch": 0.9296774588408103, + "grad_norm": 0.15755385160446167, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0035, + "step": 15190 + }, + { + "epoch": 0.9302894914009425, + "grad_norm": 0.21062517166137695, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 0.9309015239610747, + "grad_norm": 0.1403888463973999, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0051, + "step": 15210 + }, + { + "epoch": 0.9315135565212069, + "grad_norm": 0.4044550359249115, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0062, + "step": 15220 + }, + { + "epoch": 0.9321255890813391, + "grad_norm": 0.22543896734714508, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 0.9327376216414713, + "grad_norm": 0.2025403380393982, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0055, + "step": 15240 + }, + { + "epoch": 0.9333496542016035, + "grad_norm": 1.0549683570861816, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0092, + "step": 15250 + }, + { + "epoch": 0.9339616867617357, + "grad_norm": 0.3442397117614746, + "learning_rate": 1.123494277220359e-05, + "loss": 0.005, + "step": 15260 + }, + { + "epoch": 0.934573719321868, + "grad_norm": 0.1678813248872757, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.005, + "step": 15270 + }, + { + "epoch": 0.9351857518820001, + "grad_norm": 0.31081119179725647, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 0.9357977844421324, + "grad_norm": 0.25498780608177185, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.006, + "step": 15290 + }, + { + "epoch": 0.9364098170022646, + "grad_norm": 0.21825125813484192, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0054, + "step": 15300 + }, + { + "epoch": 0.9370218495623968, + "grad_norm": 0.19719983637332916, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0074, + "step": 15310 + }, + { + "epoch": 0.937633882122529, + "grad_norm": 0.32297465205192566, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0058, + "step": 15320 + }, + { + "epoch": 0.9382459146826612, + "grad_norm": 0.2717733383178711, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0035, + "step": 15330 + }, + { + "epoch": 0.9388579472427934, + "grad_norm": 0.22138433158397675, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0048, + "step": 15340 + }, + { + "epoch": 0.9394699798029256, + "grad_norm": 0.1943465769290924, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 0.9400820123630577, + "grad_norm": 0.18422184884548187, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0054, + "step": 15360 + }, + { + "epoch": 0.9406940449231899, + "grad_norm": 0.17614246904850006, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0054, + "step": 15370 + }, + { + "epoch": 0.9413060774833221, + "grad_norm": 0.17661592364311218, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9419181100434543, + "grad_norm": 0.42976850271224976, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0054, + "step": 15390 + }, + { + "epoch": 0.9425301426035865, + "grad_norm": 0.34272316098213196, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0044, + "step": 15400 + }, + { + "epoch": 0.9431421751637187, + "grad_norm": 0.3346613645553589, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0042, + "step": 15410 + }, + { + "epoch": 0.9437542077238509, + "grad_norm": 0.15300114452838898, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0057, + "step": 15420 + }, + { + "epoch": 0.9443662402839831, + "grad_norm": 0.23935656249523163, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0084, + "step": 15430 + }, + { + "epoch": 0.9449782728441153, + "grad_norm": 0.21595227718353271, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0051, + "step": 15440 + }, + { + "epoch": 0.9455903054042475, + "grad_norm": 0.2670149505138397, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0059, + "step": 15450 + }, + { + "epoch": 0.9462023379643797, + "grad_norm": 0.2214009314775467, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0054, + "step": 15460 + }, + { + "epoch": 0.9468143705245119, + "grad_norm": 0.3491996228694916, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 0.9474264030846441, + "grad_norm": 0.28213024139404297, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0054, + "step": 15480 + }, + { + "epoch": 0.9480384356447763, + "grad_norm": 0.30218765139579773, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0049, + "step": 15490 + }, + { + "epoch": 0.9486504682049085, + "grad_norm": 0.17068025469779968, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0046, + "step": 15500 + }, + { + "epoch": 0.9492625007650407, + "grad_norm": 0.23325121402740479, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0054, + "step": 15510 + }, + { + "epoch": 0.9498745333251729, + "grad_norm": 0.22118528187274933, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0035, + "step": 15520 + }, + { + "epoch": 0.9504865658853051, + "grad_norm": 0.20202121138572693, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9510985984454373, + "grad_norm": 0.28455010056495667, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0039, + "step": 15540 + }, + { + "epoch": 0.9517106310055695, + "grad_norm": 0.26871445775032043, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0046, + "step": 15550 + }, + { + "epoch": 0.9523226635657017, + "grad_norm": 0.33665943145751953, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0058, + "step": 15560 + }, + { + "epoch": 0.9529346961258339, + "grad_norm": 0.3182595670223236, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0053, + "step": 15570 + }, + { + "epoch": 0.9535467286859661, + "grad_norm": 0.2867930829524994, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0068, + "step": 15580 + }, + { + "epoch": 0.9541587612460983, + "grad_norm": 0.21562239527702332, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.0051, + "step": 15590 + }, + { + "epoch": 0.9547707938062305, + "grad_norm": 0.19122859835624695, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0046, + "step": 15600 + }, + { + "epoch": 0.9553828263663627, + "grad_norm": 0.24596959352493286, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.005, + "step": 15610 + }, + { + "epoch": 0.9559948589264949, + "grad_norm": 0.182195246219635, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0038, + "step": 15620 + }, + { + "epoch": 0.9566068914866271, + "grad_norm": 0.3122585415840149, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0053, + "step": 15630 + }, + { + "epoch": 0.9572189240467592, + "grad_norm": 0.25725093483924866, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0063, + "step": 15640 + }, + { + "epoch": 0.9578309566068914, + "grad_norm": 0.19965514540672302, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0053, + "step": 15650 + }, + { + "epoch": 0.9584429891670236, + "grad_norm": 0.3474758267402649, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.006, + "step": 15660 + }, + { + "epoch": 0.9590550217271558, + "grad_norm": 0.18151336908340454, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0048, + "step": 15670 + }, + { + "epoch": 0.959667054287288, + "grad_norm": 0.18923020362854004, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0037, + "step": 15680 + }, + { + "epoch": 0.9602790868474202, + "grad_norm": 0.19792871177196503, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0049, + "step": 15690 + }, + { + "epoch": 0.9608911194075525, + "grad_norm": 0.20296797156333923, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0047, + "step": 15700 + }, + { + "epoch": 0.9615031519676847, + "grad_norm": 0.2556051015853882, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.9621151845278169, + "grad_norm": 0.35538288950920105, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0037, + "step": 15720 + }, + { + "epoch": 0.9627272170879491, + "grad_norm": 0.45357266068458557, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0065, + "step": 15730 + }, + { + "epoch": 0.9633392496480813, + "grad_norm": 0.23721693456172943, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0046, + "step": 15740 + }, + { + "epoch": 0.9639512822082135, + "grad_norm": 0.2727845013141632, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0052, + "step": 15750 + }, + { + "epoch": 0.9645633147683457, + "grad_norm": 0.2647950351238251, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0054, + "step": 15760 + }, + { + "epoch": 0.9651753473284779, + "grad_norm": 0.23364882171154022, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.005, + "step": 15770 + }, + { + "epoch": 0.9657873798886101, + "grad_norm": 0.2035825401544571, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0054, + "step": 15780 + }, + { + "epoch": 0.9663994124487423, + "grad_norm": 0.2411692589521408, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0062, + "step": 15790 + }, + { + "epoch": 0.9670114450088745, + "grad_norm": 0.23559266328811646, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 0.9676234775690067, + "grad_norm": 0.23872418701648712, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0063, + "step": 15810 + }, + { + "epoch": 0.9682355101291389, + "grad_norm": 0.27072128653526306, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0052, + "step": 15820 + }, + { + "epoch": 0.9688475426892711, + "grad_norm": 0.42610588669776917, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0056, + "step": 15830 + }, + { + "epoch": 0.9694595752494033, + "grad_norm": 0.13065233826637268, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0044, + "step": 15840 + }, + { + "epoch": 0.9700716078095355, + "grad_norm": 0.2479996383190155, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0049, + "step": 15850 + }, + { + "epoch": 0.9706836403696677, + "grad_norm": 0.22867974638938904, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 0.9712956729297999, + "grad_norm": 0.21570387482643127, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0048, + "step": 15870 + }, + { + "epoch": 0.9719077054899321, + "grad_norm": 0.26354169845581055, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0073, + "step": 15880 + }, + { + "epoch": 0.9725197380500643, + "grad_norm": 0.19785451889038086, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0044, + "step": 15890 + }, + { + "epoch": 0.9731317706101965, + "grad_norm": 0.09346124529838562, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0051, + "step": 15900 + }, + { + "epoch": 0.9737438031703287, + "grad_norm": 0.18946298956871033, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0049, + "step": 15910 + }, + { + "epoch": 0.9743558357304608, + "grad_norm": 0.1761726588010788, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0057, + "step": 15920 + }, + { + "epoch": 0.974967868290593, + "grad_norm": 0.2610328495502472, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0061, + "step": 15930 + }, + { + "epoch": 0.9755799008507252, + "grad_norm": 0.1841743141412735, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 0.9761919334108574, + "grad_norm": 0.14279355108737946, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 0.9768039659709896, + "grad_norm": 0.1717681884765625, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0035, + "step": 15960 + }, + { + "epoch": 0.9774159985311218, + "grad_norm": 0.2102527618408203, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.007, + "step": 15970 + }, + { + "epoch": 0.978028031091254, + "grad_norm": 0.29462379217147827, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0058, + "step": 15980 + }, + { + "epoch": 0.9786400636513862, + "grad_norm": 0.1863207072019577, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0058, + "step": 15990 + }, + { + "epoch": 0.9792520962115184, + "grad_norm": 0.2764773964881897, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0051, + "step": 16000 + }, + { + "epoch": 0.9798641287716506, + "grad_norm": 0.2723250091075897, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0056, + "step": 16010 + }, + { + "epoch": 0.9804761613317828, + "grad_norm": 0.21564331650733948, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0048, + "step": 16020 + }, + { + "epoch": 0.981088193891915, + "grad_norm": 0.20242232084274292, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0058, + "step": 16030 + }, + { + "epoch": 0.9817002264520472, + "grad_norm": 0.21522754430770874, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0039, + "step": 16040 + }, + { + "epoch": 0.9823122590121794, + "grad_norm": 0.20013833045959473, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0051, + "step": 16050 + }, + { + "epoch": 0.9829242915723116, + "grad_norm": 0.3008810579776764, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 0.9835363241324439, + "grad_norm": 0.2994979918003082, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0033, + "step": 16070 + }, + { + "epoch": 0.984148356692576, + "grad_norm": 0.22704628109931946, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0046, + "step": 16080 + }, + { + "epoch": 0.9847603892527083, + "grad_norm": 0.3253551423549652, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9853724218128405, + "grad_norm": 0.14902091026306152, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0042, + "step": 16100 + }, + { + "epoch": 0.9859844543729727, + "grad_norm": 0.15155524015426636, + "learning_rate": 1.04066696184376e-05, + "loss": 0.005, + "step": 16110 + }, + { + "epoch": 0.9865964869331049, + "grad_norm": 0.1859518140554428, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0063, + "step": 16120 + }, + { + "epoch": 0.9872085194932371, + "grad_norm": 0.5434902906417847, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0072, + "step": 16130 + }, + { + "epoch": 0.9878205520533693, + "grad_norm": 0.19308103621006012, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0046, + "step": 16140 + }, + { + "epoch": 0.9884325846135015, + "grad_norm": 0.21260593831539154, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0077, + "step": 16150 + }, + { + "epoch": 0.9890446171736337, + "grad_norm": 0.15255668759346008, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0059, + "step": 16160 + }, + { + "epoch": 0.9896566497337659, + "grad_norm": 0.18739885091781616, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0047, + "step": 16170 + }, + { + "epoch": 0.9902686822938981, + "grad_norm": 0.2112029641866684, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0049, + "step": 16180 + }, + { + "epoch": 0.9908807148540303, + "grad_norm": 0.35941991209983826, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.005, + "step": 16190 + }, + { + "epoch": 0.9914927474141624, + "grad_norm": 0.16792108118534088, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0051, + "step": 16200 + }, + { + "epoch": 0.9921047799742946, + "grad_norm": 0.1985466182231903, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0063, + "step": 16210 + }, + { + "epoch": 0.9927168125344268, + "grad_norm": 0.17579570412635803, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0046, + "step": 16220 + }, + { + "epoch": 0.993328845094559, + "grad_norm": 0.23352178931236267, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0061, + "step": 16230 + }, + { + "epoch": 0.9939408776546912, + "grad_norm": 0.3543553054332733, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0054, + "step": 16240 + }, + { + "epoch": 0.9945529102148234, + "grad_norm": 0.18603719770908356, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0049, + "step": 16250 + }, + { + "epoch": 0.9951649427749556, + "grad_norm": 0.31745344400405884, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0061, + "step": 16260 + }, + { + "epoch": 0.9957769753350878, + "grad_norm": 0.1416773498058319, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0063, + "step": 16270 + }, + { + "epoch": 0.99638900789522, + "grad_norm": 0.18451642990112305, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0055, + "step": 16280 + }, + { + "epoch": 0.9970010404553522, + "grad_norm": 0.13422183692455292, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0047, + "step": 16290 + }, + { + "epoch": 0.9976130730154844, + "grad_norm": 0.15831588208675385, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0045, + "step": 16300 + }, + { + "epoch": 0.9982251055756166, + "grad_norm": 0.42520084977149963, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0053, + "step": 16310 + }, + { + "epoch": 0.9988371381357488, + "grad_norm": 0.20889437198638916, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0043, + "step": 16320 + }, + { + "epoch": 0.999449170695881, + "grad_norm": 0.17016667127609253, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0072, + "step": 16330 + }, + { + "epoch": 1.0000612032560132, + "grad_norm": 0.3129214346408844, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0054, + "step": 16340 + }, + { + "epoch": 1.0006732358161454, + "grad_norm": 0.334224134683609, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0037, + "step": 16350 + }, + { + "epoch": 1.0012852683762776, + "grad_norm": 0.28502705693244934, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0065, + "step": 16360 + }, + { + "epoch": 1.0018973009364098, + "grad_norm": 0.21431966125965118, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0046, + "step": 16370 + }, + { + "epoch": 1.002509333496542, + "grad_norm": 0.22898051142692566, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 1.0031213660566742, + "grad_norm": 0.41625624895095825, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0068, + "step": 16390 + }, + { + "epoch": 1.0037333986168064, + "grad_norm": 0.2510327398777008, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 1.0043454311769386, + "grad_norm": 0.23560962080955505, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0049, + "step": 16410 + }, + { + "epoch": 1.0049574637370708, + "grad_norm": 0.2081199437379837, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0061, + "step": 16420 + }, + { + "epoch": 1.005569496297203, + "grad_norm": 0.12456244230270386, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0057, + "step": 16430 + }, + { + "epoch": 1.0061815288573353, + "grad_norm": 0.22212636470794678, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0052, + "step": 16440 + }, + { + "epoch": 1.0067935614174675, + "grad_norm": 0.27772897481918335, + "learning_rate": 1.007637577910799e-05, + "loss": 0.007, + "step": 16450 + }, + { + "epoch": 1.0074055939775997, + "grad_norm": 0.40040507912635803, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0051, + "step": 16460 + }, + { + "epoch": 1.0080176265377319, + "grad_norm": 0.19763565063476562, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0047, + "step": 16470 + }, + { + "epoch": 1.008629659097864, + "grad_norm": 0.2906181514263153, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0055, + "step": 16480 + }, + { + "epoch": 1.0092416916579963, + "grad_norm": 0.29949888586997986, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0045, + "step": 16490 + }, + { + "epoch": 1.0098537242181285, + "grad_norm": 0.3900962769985199, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0053, + "step": 16500 + }, + { + "epoch": 1.0104657567782607, + "grad_norm": 0.22380846738815308, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0043, + "step": 16510 + }, + { + "epoch": 1.0110777893383929, + "grad_norm": 0.3426673412322998, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0052, + "step": 16520 + }, + { + "epoch": 1.011689821898525, + "grad_norm": 0.2452230006456375, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0055, + "step": 16530 + }, + { + "epoch": 1.0123018544586573, + "grad_norm": 0.24280408024787903, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0042, + "step": 16540 + }, + { + "epoch": 1.0129138870187895, + "grad_norm": 0.18271701037883759, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0047, + "step": 16550 + }, + { + "epoch": 1.0135259195789217, + "grad_norm": 0.2874322235584259, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0059, + "step": 16560 + }, + { + "epoch": 1.0141379521390539, + "grad_norm": 0.17367394268512726, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0048, + "step": 16570 + }, + { + "epoch": 1.014749984699186, + "grad_norm": 0.167460098862648, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0074, + "step": 16580 + }, + { + "epoch": 1.0153620172593183, + "grad_norm": 0.21867765486240387, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 1.0159740498194505, + "grad_norm": 0.2539086639881134, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0057, + "step": 16600 + }, + { + "epoch": 1.0165860823795827, + "grad_norm": 0.1415795534849167, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 1.0171981149397147, + "grad_norm": 0.12702493369579315, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0038, + "step": 16620 + }, + { + "epoch": 1.0178101474998469, + "grad_norm": 0.16548305749893188, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0042, + "step": 16630 + }, + { + "epoch": 1.018422180059979, + "grad_norm": 0.4413173496723175, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0059, + "step": 16640 + }, + { + "epoch": 1.0190342126201113, + "grad_norm": 0.30871614813804626, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0045, + "step": 16650 + }, + { + "epoch": 1.0196462451802435, + "grad_norm": 0.259650319814682, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0044, + "step": 16660 + }, + { + "epoch": 1.0202582777403757, + "grad_norm": 0.36035388708114624, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0068, + "step": 16670 + }, + { + "epoch": 1.020870310300508, + "grad_norm": 0.3487808406352997, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0038, + "step": 16680 + }, + { + "epoch": 1.02148234286064, + "grad_norm": 0.2898370623588562, + "learning_rate": 9.843955128197274e-06, + "loss": 0.004, + "step": 16690 + }, + { + "epoch": 1.0220943754207723, + "grad_norm": 0.2942182719707489, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0042, + "step": 16700 + }, + { + "epoch": 1.0227064079809045, + "grad_norm": 0.27839869260787964, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0042, + "step": 16710 + }, + { + "epoch": 1.0233184405410367, + "grad_norm": 0.17199957370758057, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0059, + "step": 16720 + }, + { + "epoch": 1.023930473101169, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0073, + "step": 16730 + }, + { + "epoch": 1.0245425056613011, + "grad_norm": 0.19908513128757477, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0047, + "step": 16740 + }, + { + "epoch": 1.0251545382214333, + "grad_norm": 0.23300328850746155, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0055, + "step": 16750 + }, + { + "epoch": 1.0257665707815655, + "grad_norm": 0.24671277403831482, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0043, + "step": 16760 + }, + { + "epoch": 1.0263786033416977, + "grad_norm": 0.23183101415634155, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0052, + "step": 16770 + }, + { + "epoch": 1.02699063590183, + "grad_norm": 0.13460612297058105, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0035, + "step": 16780 + }, + { + "epoch": 1.0276026684619621, + "grad_norm": 0.1990940123796463, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0044, + "step": 16790 + }, + { + "epoch": 1.0282147010220943, + "grad_norm": 0.21223406493663788, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 1.0288267335822265, + "grad_norm": 0.2649106979370117, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0044, + "step": 16810 + }, + { + "epoch": 1.0294387661423587, + "grad_norm": 0.2524845600128174, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0048, + "step": 16820 + }, + { + "epoch": 1.030050798702491, + "grad_norm": 0.22169779241085052, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 1.0306628312626231, + "grad_norm": 0.16642418503761292, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0048, + "step": 16840 + }, + { + "epoch": 1.0312748638227553, + "grad_norm": 0.22939598560333252, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0048, + "step": 16850 + }, + { + "epoch": 1.0318868963828876, + "grad_norm": 0.2131129503250122, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0055, + "step": 16860 + }, + { + "epoch": 1.0324989289430198, + "grad_norm": 0.20492705702781677, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0041, + "step": 16870 + }, + { + "epoch": 1.033110961503152, + "grad_norm": 0.2988845705986023, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0042, + "step": 16880 + }, + { + "epoch": 1.0337229940632842, + "grad_norm": 0.18579600751399994, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0042, + "step": 16890 + }, + { + "epoch": 1.0343350266234164, + "grad_norm": 0.2553490698337555, + "learning_rate": 9.641222698101725e-06, + "loss": 0.005, + "step": 16900 + }, + { + "epoch": 1.0349470591835486, + "grad_norm": 0.338440865278244, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0036, + "step": 16910 + }, + { + "epoch": 1.0355590917436808, + "grad_norm": 0.12755723297595978, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0044, + "step": 16920 + }, + { + "epoch": 1.036171124303813, + "grad_norm": 0.12222232669591904, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 1.0367831568639452, + "grad_norm": 0.20246204733848572, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0055, + "step": 16940 + }, + { + "epoch": 1.0373951894240774, + "grad_norm": 0.36903291940689087, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0051, + "step": 16950 + }, + { + "epoch": 1.0380072219842096, + "grad_norm": 0.3166116178035736, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0045, + "step": 16960 + }, + { + "epoch": 1.0386192545443418, + "grad_norm": 0.2777375280857086, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0041, + "step": 16970 + }, + { + "epoch": 1.039231287104474, + "grad_norm": 0.3173989951610565, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0053, + "step": 16980 + }, + { + "epoch": 1.0398433196646062, + "grad_norm": 0.2135571539402008, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0051, + "step": 16990 + }, + { + "epoch": 1.0404553522247384, + "grad_norm": 0.18536782264709473, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0037, + "step": 17000 + }, + { + "epoch": 1.0410673847848706, + "grad_norm": 0.17782410979270935, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0052, + "step": 17010 + }, + { + "epoch": 1.0416794173450028, + "grad_norm": 0.31509512662887573, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0099, + "step": 17020 + }, + { + "epoch": 1.042291449905135, + "grad_norm": 0.22748225927352905, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 1.0429034824652672, + "grad_norm": 0.14924705028533936, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 1.0435155150253994, + "grad_norm": 0.21390999853610992, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0044, + "step": 17050 + }, + { + "epoch": 1.0441275475855316, + "grad_norm": 0.25828516483306885, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0042, + "step": 17060 + }, + { + "epoch": 1.0447395801456638, + "grad_norm": 0.24069662392139435, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0069, + "step": 17070 + }, + { + "epoch": 1.045351612705796, + "grad_norm": 0.1090504601597786, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0036, + "step": 17080 + }, + { + "epoch": 1.0459636452659282, + "grad_norm": 0.17990687489509583, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0049, + "step": 17090 + }, + { + "epoch": 1.0465756778260604, + "grad_norm": 0.21505555510520935, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0051, + "step": 17100 + }, + { + "epoch": 1.0471877103861926, + "grad_norm": 0.2157493680715561, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0063, + "step": 17110 + }, + { + "epoch": 1.0477997429463248, + "grad_norm": 0.30865493416786194, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0053, + "step": 17120 + }, + { + "epoch": 1.048411775506457, + "grad_norm": 0.16882938146591187, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0043, + "step": 17130 + }, + { + "epoch": 1.0490238080665892, + "grad_norm": 0.14921846985816956, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0496358406267214, + "grad_norm": 0.15723800659179688, + "learning_rate": 9.400800085133245e-06, + "loss": 0.005, + "step": 17150 + }, + { + "epoch": 1.0502478731868536, + "grad_norm": 0.19597285985946655, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0046, + "step": 17160 + }, + { + "epoch": 1.0508599057469858, + "grad_norm": 0.1684723198413849, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0053, + "step": 17170 + }, + { + "epoch": 1.051471938307118, + "grad_norm": 0.1733175367116928, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0053, + "step": 17180 + }, + { + "epoch": 1.0520839708672503, + "grad_norm": 0.23111647367477417, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0048, + "step": 17190 + }, + { + "epoch": 1.0526960034273822, + "grad_norm": 0.36174628138542175, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0049, + "step": 17200 + }, + { + "epoch": 1.0533080359875144, + "grad_norm": 0.15791575610637665, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0048, + "step": 17210 + }, + { + "epoch": 1.0539200685476466, + "grad_norm": 0.16026809811592102, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0047, + "step": 17220 + }, + { + "epoch": 1.0545321011077788, + "grad_norm": 0.13964296877384186, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0033, + "step": 17230 + }, + { + "epoch": 1.055144133667911, + "grad_norm": 0.22623896598815918, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0041, + "step": 17240 + }, + { + "epoch": 1.0557561662280432, + "grad_norm": 0.15534555912017822, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 1.0563681987881754, + "grad_norm": 0.09519665688276291, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0035, + "step": 17260 + }, + { + "epoch": 1.0569802313483077, + "grad_norm": 0.19323785603046417, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0045, + "step": 17270 + }, + { + "epoch": 1.0575922639084399, + "grad_norm": 0.21194952726364136, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0047, + "step": 17280 + }, + { + "epoch": 1.058204296468572, + "grad_norm": 0.28977999091148376, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0049, + "step": 17290 + }, + { + "epoch": 1.0588163290287043, + "grad_norm": 0.1739121824502945, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0062, + "step": 17300 + }, + { + "epoch": 1.0594283615888365, + "grad_norm": 0.23189865052700043, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0055, + "step": 17310 + }, + { + "epoch": 1.0600403941489687, + "grad_norm": 0.15705449879169464, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 1.0606524267091009, + "grad_norm": 0.23189882934093475, + "learning_rate": 9.228411903689187e-06, + "loss": 0.003, + "step": 17330 + }, + { + "epoch": 1.061264459269233, + "grad_norm": 0.19559095799922943, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0051, + "step": 17340 + }, + { + "epoch": 1.0618764918293653, + "grad_norm": 0.2560543715953827, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0624885243894975, + "grad_norm": 0.35167232155799866, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0042, + "step": 17360 + }, + { + "epoch": 1.0631005569496297, + "grad_norm": 0.17626497149467468, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0039, + "step": 17370 + }, + { + "epoch": 1.0637125895097619, + "grad_norm": 0.18818546831607819, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0043, + "step": 17380 + }, + { + "epoch": 1.064324622069894, + "grad_norm": 0.10237561911344528, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0053, + "step": 17390 + }, + { + "epoch": 1.0649366546300263, + "grad_norm": 0.21828459203243256, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0042, + "step": 17400 + }, + { + "epoch": 1.0655486871901585, + "grad_norm": 0.09354235231876373, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 1.0661607197502907, + "grad_norm": 0.18106088042259216, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0051, + "step": 17420 + }, + { + "epoch": 1.066772752310423, + "grad_norm": 0.21538101136684418, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0056, + "step": 17430 + }, + { + "epoch": 1.067384784870555, + "grad_norm": 0.18729519844055176, + "learning_rate": 9.1233909973763e-06, + "loss": 0.004, + "step": 17440 + }, + { + "epoch": 1.0679968174306873, + "grad_norm": 0.3791484832763672, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0052, + "step": 17450 + }, + { + "epoch": 1.0686088499908195, + "grad_norm": 0.19206254184246063, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0042, + "step": 17460 + }, + { + "epoch": 1.0692208825509517, + "grad_norm": 0.15434518456459045, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0061, + "step": 17470 + }, + { + "epoch": 1.069832915111084, + "grad_norm": 0.17898093163967133, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 1.0704449476712161, + "grad_norm": 0.21975649893283844, + "learning_rate": 9.07574141798717e-06, + "loss": 0.005, + "step": 17490 + }, + { + "epoch": 1.0710569802313483, + "grad_norm": 0.1380346417427063, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0032, + "step": 17500 + }, + { + "epoch": 1.0716690127914805, + "grad_norm": 0.28567400574684143, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0044, + "step": 17510 + }, + { + "epoch": 1.0722810453516127, + "grad_norm": 0.22925534844398499, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 1.072893077911745, + "grad_norm": 0.27094215154647827, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0047, + "step": 17530 + }, + { + "epoch": 1.0735051104718771, + "grad_norm": 0.32299691438674927, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0048, + "step": 17540 + }, + { + "epoch": 1.0741171430320093, + "grad_norm": 0.26789531111717224, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 1.0747291755921415, + "grad_norm": 0.3175952434539795, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0077, + "step": 17560 + }, + { + "epoch": 1.0753412081522737, + "grad_norm": 0.24784249067306519, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 1.075953240712406, + "grad_norm": 0.3081960380077362, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0046, + "step": 17580 + }, + { + "epoch": 1.0765652732725381, + "grad_norm": 0.25334152579307556, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0039, + "step": 17590 + }, + { + "epoch": 1.0771773058326704, + "grad_norm": 0.24747619032859802, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0777893383928026, + "grad_norm": 0.19048908352851868, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0049, + "step": 17610 + }, + { + "epoch": 1.0784013709529348, + "grad_norm": 0.18883349001407623, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "epoch": 1.079013403513067, + "grad_norm": 0.18653099238872528, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0044, + "step": 17630 + }, + { + "epoch": 1.0796254360731992, + "grad_norm": 0.1320251226425171, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 1.0802374686333314, + "grad_norm": 0.14996238052845, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0041, + "step": 17650 + }, + { + "epoch": 1.0808495011934636, + "grad_norm": 0.4576573073863983, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0059, + "step": 17660 + }, + { + "epoch": 1.0814615337535958, + "grad_norm": 0.19582511484622955, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0051, + "step": 17670 + }, + { + "epoch": 1.082073566313728, + "grad_norm": 0.21973003447055817, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0059, + "step": 17680 + }, + { + "epoch": 1.0826855988738602, + "grad_norm": 0.18183568120002747, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0056, + "step": 17690 + }, + { + "epoch": 1.0832976314339924, + "grad_norm": 0.1761978417634964, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0049, + "step": 17700 + }, + { + "epoch": 1.0839096639941246, + "grad_norm": 0.10185366123914719, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0041, + "step": 17710 + }, + { + "epoch": 1.0845216965542568, + "grad_norm": 0.262513130903244, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0046, + "step": 17720 + }, + { + "epoch": 1.0851337291143888, + "grad_norm": 0.36413198709487915, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0043, + "step": 17730 + }, + { + "epoch": 1.085745761674521, + "grad_norm": 0.2258218675851822, + "learning_rate": 8.83836825410936e-06, + "loss": 0.005, + "step": 17740 + }, + { + "epoch": 1.0863577942346532, + "grad_norm": 0.20840497314929962, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0067, + "step": 17750 + }, + { + "epoch": 1.0869698267947854, + "grad_norm": 0.33392995595932007, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.0875818593549176, + "grad_norm": 0.18477876484394073, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0039, + "step": 17770 + }, + { + "epoch": 1.0881938919150498, + "grad_norm": 0.14785899221897125, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0063, + "step": 17780 + }, + { + "epoch": 1.088805924475182, + "grad_norm": 0.12930043041706085, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0055, + "step": 17790 + }, + { + "epoch": 1.0894179570353142, + "grad_norm": 0.1541786789894104, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0035, + "step": 17800 + }, + { + "epoch": 1.0900299895954464, + "grad_norm": 0.1781499683856964, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0054, + "step": 17810 + }, + { + "epoch": 1.0906420221555786, + "grad_norm": 0.13659314811229706, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0912540547157108, + "grad_norm": 0.18936918675899506, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0041, + "step": 17830 + }, + { + "epoch": 1.091866087275843, + "grad_norm": 0.24795638024806976, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0043, + "step": 17840 + }, + { + "epoch": 1.0924781198359752, + "grad_norm": 0.28090324997901917, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 1.0930901523961074, + "grad_norm": 0.3130576014518738, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0042, + "step": 17860 + }, + { + "epoch": 1.0937021849562396, + "grad_norm": 0.19758646190166473, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0047, + "step": 17870 + }, + { + "epoch": 1.0943142175163718, + "grad_norm": 0.20309071242809296, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0042, + "step": 17880 + }, + { + "epoch": 1.094926250076504, + "grad_norm": 0.19741898775100708, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0057, + "step": 17890 + }, + { + "epoch": 1.0955382826366362, + "grad_norm": 0.19182747602462769, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 1.0961503151967684, + "grad_norm": 0.14508575201034546, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.0967623477569006, + "grad_norm": 0.19854849576950073, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0064, + "step": 17920 + }, + { + "epoch": 1.0973743803170328, + "grad_norm": 0.15055720508098602, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0049, + "step": 17930 + }, + { + "epoch": 1.097986412877165, + "grad_norm": 0.1855372190475464, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0043, + "step": 17940 + }, + { + "epoch": 1.0985984454372972, + "grad_norm": 0.13770940899848938, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0058, + "step": 17950 + }, + { + "epoch": 1.0992104779974294, + "grad_norm": 0.24905221164226532, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0048, + "step": 17960 + }, + { + "epoch": 1.0998225105575616, + "grad_norm": 0.1951165348291397, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0043, + "step": 17970 + }, + { + "epoch": 1.1004345431176938, + "grad_norm": 0.18365852534770966, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 1.101046575677826, + "grad_norm": 0.16304127871990204, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0034, + "step": 17990 + }, + { + "epoch": 1.1016586082379582, + "grad_norm": 0.262677401304245, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 1.1022706407980905, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.583791146965244e-06, + "loss": 0.007, + "step": 18010 + }, + { + "epoch": 1.1028826733582227, + "grad_norm": 0.2832951247692108, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0056, + "step": 18020 + }, + { + "epoch": 1.1034947059183549, + "grad_norm": 0.1781810224056244, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 1.104106738478487, + "grad_norm": 0.23228950798511505, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0045, + "step": 18040 + }, + { + "epoch": 1.1047187710386193, + "grad_norm": 0.2573170065879822, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 1.1053308035987515, + "grad_norm": 0.30996036529541016, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0054, + "step": 18060 + }, + { + "epoch": 1.1059428361588837, + "grad_norm": 0.24979132413864136, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0045, + "step": 18070 + }, + { + "epoch": 1.1065548687190159, + "grad_norm": 0.17564314603805542, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0033, + "step": 18080 + }, + { + "epoch": 1.107166901279148, + "grad_norm": 0.14539776742458344, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0047, + "step": 18090 + }, + { + "epoch": 1.1077789338392803, + "grad_norm": 0.2530387341976166, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0058, + "step": 18100 + }, + { + "epoch": 1.1083909663994125, + "grad_norm": 0.2038760781288147, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0052, + "step": 18110 + }, + { + "epoch": 1.1090029989595447, + "grad_norm": 0.1769075244665146, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0043, + "step": 18120 + }, + { + "epoch": 1.1096150315196769, + "grad_norm": 0.1686626374721527, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0055, + "step": 18130 + }, + { + "epoch": 1.110227064079809, + "grad_norm": 0.21752336621284485, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0052, + "step": 18140 + }, + { + "epoch": 1.1108390966399413, + "grad_norm": 0.2739295959472656, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0039, + "step": 18150 + }, + { + "epoch": 1.1114511292000735, + "grad_norm": 0.18259567022323608, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0038, + "step": 18160 + }, + { + "epoch": 1.1120631617602057, + "grad_norm": 0.21565310657024384, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0043, + "step": 18170 + }, + { + "epoch": 1.112675194320338, + "grad_norm": 0.2141607403755188, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0059, + "step": 18180 + }, + { + "epoch": 1.11328722688047, + "grad_norm": 0.3017563819885254, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0044, + "step": 18190 + }, + { + "epoch": 1.1138992594406023, + "grad_norm": 0.2021455019712448, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0044, + "step": 18200 + }, + { + "epoch": 1.1145112920007345, + "grad_norm": 0.2113070785999298, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0048, + "step": 18210 + }, + { + "epoch": 1.1151233245608667, + "grad_norm": 0.18945784866809845, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0029, + "step": 18220 + }, + { + "epoch": 1.115735357120999, + "grad_norm": 0.15259192883968353, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0043, + "step": 18230 + }, + { + "epoch": 1.1163473896811311, + "grad_norm": 0.17555822432041168, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0039, + "step": 18240 + }, + { + "epoch": 1.1169594222412633, + "grad_norm": 0.20105648040771484, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0058, + "step": 18250 + }, + { + "epoch": 1.1175714548013955, + "grad_norm": 0.31626567244529724, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0044, + "step": 18260 + }, + { + "epoch": 1.1181834873615277, + "grad_norm": 0.16219007968902588, + "learning_rate": 8.340593854157868e-06, + "loss": 0.005, + "step": 18270 + }, + { + "epoch": 1.11879551992166, + "grad_norm": 0.2174186110496521, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0055, + "step": 18280 + }, + { + "epoch": 1.1194075524817921, + "grad_norm": 0.13639339804649353, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0043, + "step": 18290 + }, + { + "epoch": 1.1200195850419243, + "grad_norm": 0.15100249648094177, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0042, + "step": 18300 + }, + { + "epoch": 1.1206316176020565, + "grad_norm": 0.2114904671907425, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0043, + "step": 18310 + }, + { + "epoch": 1.1212436501621887, + "grad_norm": 0.2941966950893402, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0052, + "step": 18320 + }, + { + "epoch": 1.1218556827223207, + "grad_norm": 0.21695150434970856, + "learning_rate": 8.28476400245882e-06, + "loss": 0.005, + "step": 18330 + }, + { + "epoch": 1.122467715282453, + "grad_norm": 0.11768218129873276, + "learning_rate": 8.275470116190976e-06, + "loss": 0.005, + "step": 18340 + }, + { + "epoch": 1.1230797478425851, + "grad_norm": 0.1427483856678009, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0039, + "step": 18350 + }, + { + "epoch": 1.1236917804027173, + "grad_norm": 0.1837971955537796, + "learning_rate": 8.256891946721157e-06, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 1.1243038129628495, + "grad_norm": 0.30968883633613586, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0037, + "step": 18370 + }, + { + "epoch": 1.1249158455229817, + "grad_norm": 0.13366396725177765, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0042, + "step": 18380 + }, + { + "epoch": 1.125527878083114, + "grad_norm": 0.1829235553741455, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0039, + "step": 18390 + }, + { + "epoch": 1.1261399106432461, + "grad_norm": 0.3106991648674011, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0052, + "step": 18400 + }, + { + "epoch": 1.1267519432033783, + "grad_norm": 0.38655754923820496, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0046, + "step": 18410 + }, + { + "epoch": 1.1273639757635106, + "grad_norm": 0.23598383367061615, + "learning_rate": 8.201235047388747e-06, + "loss": 0.004, + "step": 18420 + }, + { + "epoch": 1.1279760083236428, + "grad_norm": 0.17428012192249298, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0046, + "step": 18430 + }, + { + "epoch": 1.128588040883775, + "grad_norm": 0.1847466081380844, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0043, + "step": 18440 + }, + { + "epoch": 1.1292000734439072, + "grad_norm": 0.14917762577533722, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0049, + "step": 18450 + }, + { + "epoch": 1.1298121060040394, + "grad_norm": 0.2882528305053711, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 1.1304241385641716, + "grad_norm": 0.36186549067497253, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0065, + "step": 18470 + }, + { + "epoch": 1.1310361711243038, + "grad_norm": 0.1604463905096054, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0037, + "step": 18480 + }, + { + "epoch": 1.131648203684436, + "grad_norm": 0.17751921713352203, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0034, + "step": 18490 + }, + { + "epoch": 1.1322602362445682, + "grad_norm": 0.15355733036994934, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0059, + "step": 18500 + }, + { + "epoch": 1.1328722688047004, + "grad_norm": 0.21558596193790436, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0044, + "step": 18510 + }, + { + "epoch": 1.1334843013648326, + "grad_norm": 0.20114412903785706, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1340963339249648, + "grad_norm": 0.17260855436325073, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0053, + "step": 18530 + }, + { + "epoch": 1.134708366485097, + "grad_norm": 0.16089287400245667, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0032, + "step": 18540 + }, + { + "epoch": 1.1353203990452292, + "grad_norm": 0.14655937254428864, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 1.1359324316053614, + "grad_norm": 0.16373249888420105, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0058, + "step": 18560 + }, + { + "epoch": 1.1365444641654936, + "grad_norm": 0.14543801546096802, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1371564967256258, + "grad_norm": 0.3515278100967407, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0043, + "step": 18580 + }, + { + "epoch": 1.137768529285758, + "grad_norm": 0.21776945888996124, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0053, + "step": 18590 + }, + { + "epoch": 1.1383805618458902, + "grad_norm": 0.21879829466342926, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0051, + "step": 18600 + }, + { + "epoch": 1.1389925944060224, + "grad_norm": 0.16967973113059998, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0048, + "step": 18610 + }, + { + "epoch": 1.1396046269661546, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0056, + "step": 18620 + }, + { + "epoch": 1.1402166595262868, + "grad_norm": 0.1858961284160614, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0067, + "step": 18630 + }, + { + "epoch": 1.140828692086419, + "grad_norm": 0.25853803753852844, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0057, + "step": 18640 + }, + { + "epoch": 1.1414407246465512, + "grad_norm": 0.18566234409809113, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0048, + "step": 18650 + }, + { + "epoch": 1.1420527572066834, + "grad_norm": 0.3471083343029022, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0042, + "step": 18660 + }, + { + "epoch": 1.1426647897668156, + "grad_norm": 0.2092636376619339, + "learning_rate": 7.970630670012853e-06, + "loss": 0.004, + "step": 18670 + }, + { + "epoch": 1.1432768223269478, + "grad_norm": 0.3432580828666687, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 1.14388885488708, + "grad_norm": 0.14227882027626038, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0041, + "step": 18690 + }, + { + "epoch": 1.1445008874472122, + "grad_norm": 0.2128007709980011, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0037, + "step": 18700 + }, + { + "epoch": 1.1451129200073444, + "grad_norm": 0.25377482175827026, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0049, + "step": 18710 + }, + { + "epoch": 1.1457249525674766, + "grad_norm": 0.1905982494354248, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0037, + "step": 18720 + }, + { + "epoch": 1.1463369851276088, + "grad_norm": 0.3090096712112427, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0079, + "step": 18730 + }, + { + "epoch": 1.146949017687741, + "grad_norm": 0.15604345500469208, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0037, + "step": 18740 + }, + { + "epoch": 1.1475610502478732, + "grad_norm": 0.21756386756896973, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0043, + "step": 18750 + }, + { + "epoch": 1.1481730828080055, + "grad_norm": 0.23869304358959198, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0038, + "step": 18760 + }, + { + "epoch": 1.1487851153681377, + "grad_norm": 0.18082380294799805, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 1.1493971479282699, + "grad_norm": 0.4032754898071289, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0061, + "step": 18780 + }, + { + "epoch": 1.150009180488402, + "grad_norm": 0.3173290491104126, + "learning_rate": 7.860719408056385e-06, + "loss": 0.004, + "step": 18790 + }, + { + "epoch": 1.1506212130485343, + "grad_norm": 0.18892645835876465, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.1512332456086665, + "grad_norm": 0.26740241050720215, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0056, + "step": 18810 + }, + { + "epoch": 1.1518452781687987, + "grad_norm": 0.3046218752861023, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0072, + "step": 18820 + }, + { + "epoch": 1.1524573107289309, + "grad_norm": 0.17181983590126038, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0034, + "step": 18830 + }, + { + "epoch": 1.1530693432890629, + "grad_norm": 0.22095724940299988, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0045, + "step": 18840 + }, + { + "epoch": 1.153681375849195, + "grad_norm": 0.1514609307050705, + "learning_rate": 7.80596155940873e-06, + "loss": 0.004, + "step": 18850 + }, + { + "epoch": 1.1542934084093273, + "grad_norm": 0.15244366228580475, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0047, + "step": 18860 + }, + { + "epoch": 1.1549054409694595, + "grad_norm": 0.24359947443008423, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0039, + "step": 18870 + }, + { + "epoch": 1.1555174735295917, + "grad_norm": 0.15558156371116638, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0036, + "step": 18880 + }, + { + "epoch": 1.1561295060897239, + "grad_norm": 0.33679234981536865, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0041, + "step": 18890 + }, + { + "epoch": 1.156741538649856, + "grad_norm": 0.15811999142169952, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0062, + "step": 18900 + }, + { + "epoch": 1.1573535712099883, + "grad_norm": 0.14838527143001556, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0029, + "step": 18910 + }, + { + "epoch": 1.1579656037701205, + "grad_norm": 0.23024815320968628, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0038, + "step": 18920 + }, + { + "epoch": 1.1585776363302527, + "grad_norm": 0.18455618619918823, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0044, + "step": 18930 + }, + { + "epoch": 1.1591896688903849, + "grad_norm": 0.20213079452514648, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.159801701450517, + "grad_norm": 0.19000643491744995, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0043, + "step": 18950 + }, + { + "epoch": 1.1604137340106493, + "grad_norm": 0.14075686037540436, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0047, + "step": 18960 + }, + { + "epoch": 1.1610257665707815, + "grad_norm": 0.22101792693138123, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0045, + "step": 18970 + }, + { + "epoch": 1.1616377991309137, + "grad_norm": 0.1097906231880188, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0037, + "step": 18980 + }, + { + "epoch": 1.162249831691046, + "grad_norm": 0.16169370710849762, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.162861864251178, + "grad_norm": 0.32931753993034363, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0052, + "step": 19000 + }, + { + "epoch": 1.1634738968113103, + "grad_norm": 0.2494741678237915, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0057, + "step": 19010 + }, + { + "epoch": 1.1640859293714425, + "grad_norm": 0.18492171168327332, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0056, + "step": 19020 + }, + { + "epoch": 1.1646979619315747, + "grad_norm": 0.18830963969230652, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0036, + "step": 19030 + }, + { + "epoch": 1.165309994491707, + "grad_norm": 0.1331586092710495, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0046, + "step": 19040 + }, + { + "epoch": 1.1659220270518391, + "grad_norm": 0.2433806210756302, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0053, + "step": 19050 + }, + { + "epoch": 1.1665340596119713, + "grad_norm": 0.24491485953330994, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 1.1671460921721035, + "grad_norm": 0.1789211630821228, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 1.1677581247322357, + "grad_norm": 0.2729121148586273, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0043, + "step": 19080 + }, + { + "epoch": 1.168370157292368, + "grad_norm": 0.19535189867019653, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0056, + "step": 19090 + }, + { + "epoch": 1.1689821898525001, + "grad_norm": 0.2282983660697937, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0048, + "step": 19100 + }, + { + "epoch": 1.1695942224126323, + "grad_norm": 0.1281195729970932, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0045, + "step": 19110 + }, + { + "epoch": 1.1702062549727645, + "grad_norm": 0.2850968539714813, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0034, + "step": 19120 + }, + { + "epoch": 1.1708182875328967, + "grad_norm": 0.12891536951065063, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.171430320093029, + "grad_norm": 0.13464727997779846, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0033, + "step": 19140 + }, + { + "epoch": 1.1720423526531611, + "grad_norm": 0.2415568083524704, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0041, + "step": 19150 + }, + { + "epoch": 1.1726543852132933, + "grad_norm": 0.15686331689357758, + "learning_rate": 7.525246655150879e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 1.1732664177734256, + "grad_norm": 0.15490666031837463, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0039, + "step": 19170 + }, + { + "epoch": 1.1738784503335578, + "grad_norm": 0.14095450937747955, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0034, + "step": 19180 + }, + { + "epoch": 1.17449048289369, + "grad_norm": 0.19024531543254852, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0084, + "step": 19190 + }, + { + "epoch": 1.1751025154538222, + "grad_norm": 0.2583692669868469, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 1.1757145480139544, + "grad_norm": 0.19117654860019684, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0038, + "step": 19210 + }, + { + "epoch": 1.1763265805740866, + "grad_norm": 0.15838374197483063, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1769386131342188, + "grad_norm": 0.30352044105529785, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0052, + "step": 19230 + }, + { + "epoch": 1.177550645694351, + "grad_norm": 0.229969322681427, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0043, + "step": 19240 + }, + { + "epoch": 1.1781626782544832, + "grad_norm": 0.17781461775302887, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0046, + "step": 19250 + }, + { + "epoch": 1.1787747108146154, + "grad_norm": 0.1306339055299759, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1793867433747476, + "grad_norm": 0.15727253258228302, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0045, + "step": 19270 + }, + { + "epoch": 1.1799987759348798, + "grad_norm": 0.24909166991710663, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0045, + "step": 19280 + }, + { + "epoch": 1.180610808495012, + "grad_norm": 0.4604126811027527, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0053, + "step": 19290 + }, + { + "epoch": 1.1812228410551442, + "grad_norm": 0.12739762663841248, + "learning_rate": 7.399737764864619e-06, + "loss": 0.004, + "step": 19300 + }, + { + "epoch": 1.1818348736152764, + "grad_norm": 0.2849223017692566, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0043, + "step": 19310 + }, + { + "epoch": 1.1824469061754086, + "grad_norm": 0.26089897751808167, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0044, + "step": 19320 + }, + { + "epoch": 1.1830589387355408, + "grad_norm": 0.1752242147922516, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.183670971295673, + "grad_norm": 0.14917130768299103, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0097, + "step": 19340 + }, + { + "epoch": 1.1842830038558052, + "grad_norm": 0.1599114090204239, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0061, + "step": 19350 + }, + { + "epoch": 1.1848950364159374, + "grad_norm": 0.16370004415512085, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 1.1855070689760696, + "grad_norm": 0.19354844093322754, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0032, + "step": 19370 + }, + { + "epoch": 1.1861191015362018, + "grad_norm": 0.19689561426639557, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0067, + "step": 19380 + }, + { + "epoch": 1.186731134096334, + "grad_norm": 0.22203278541564941, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0041, + "step": 19390 + }, + { + "epoch": 1.1873431666564662, + "grad_norm": 0.13579773902893066, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0048, + "step": 19400 + }, + { + "epoch": 1.1879551992165984, + "grad_norm": 0.12321218848228455, + "learning_rate": 7.301703138094429e-06, + "loss": 0.004, + "step": 19410 + }, + { + "epoch": 1.1885672317767306, + "grad_norm": 0.28819525241851807, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 1.1891792643368628, + "grad_norm": 0.2577916085720062, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0039, + "step": 19430 + }, + { + "epoch": 1.189791296896995, + "grad_norm": 0.26840633153915405, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0062, + "step": 19440 + }, + { + "epoch": 1.1904033294571272, + "grad_norm": 0.24222144484519958, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1910153620172594, + "grad_norm": 0.157009556889534, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0038, + "step": 19460 + }, + { + "epoch": 1.1916273945773916, + "grad_norm": 0.19925500452518463, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0042, + "step": 19470 + }, + { + "epoch": 1.1922394271375236, + "grad_norm": 0.19200846552848816, + "learning_rate": 7.239590017751423e-06, + "loss": 0.004, + "step": 19480 + }, + { + "epoch": 1.1928514596976558, + "grad_norm": 0.18441490828990936, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 1.193463492257788, + "grad_norm": 0.27565324306488037, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0046, + "step": 19500 + }, + { + "epoch": 1.1940755248179202, + "grad_norm": 0.17830556631088257, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0043, + "step": 19510 + }, + { + "epoch": 1.1946875573780524, + "grad_norm": 0.2769330143928528, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 1.1952995899381846, + "grad_norm": 0.168451189994812, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0039, + "step": 19530 + }, + { + "epoch": 1.1959116224983168, + "grad_norm": 0.31246763467788696, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0046, + "step": 19540 + }, + { + "epoch": 1.196523655058449, + "grad_norm": 0.21112671494483948, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0041, + "step": 19550 + }, + { + "epoch": 1.1971356876185812, + "grad_norm": 0.31681302189826965, + "learning_rate": 7.168868583990693e-06, + "loss": 0.005, + "step": 19560 + }, + { + "epoch": 1.1977477201787134, + "grad_norm": 0.18634411692619324, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0042, + "step": 19570 + }, + { + "epoch": 1.1983597527388457, + "grad_norm": 0.17780153453350067, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0057, + "step": 19580 + }, + { + "epoch": 1.1989717852989779, + "grad_norm": 0.19183002412319183, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0043, + "step": 19590 + }, + { + "epoch": 1.19958381785911, + "grad_norm": 0.28469574451446533, + "learning_rate": 7.133615440411572e-06, + "loss": 0.004, + "step": 19600 + }, + { + "epoch": 1.2001958504192423, + "grad_norm": 0.22470368444919586, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0044, + "step": 19610 + }, + { + "epoch": 1.2008078829793745, + "grad_norm": 0.23563240468502045, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0041, + "step": 19620 + }, + { + "epoch": 1.2014199155395067, + "grad_norm": 0.18467430770397186, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0053, + "step": 19630 + }, + { + "epoch": 1.2020319480996389, + "grad_norm": 0.12539178133010864, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0047, + "step": 19640 + }, + { + "epoch": 1.202643980659771, + "grad_norm": 0.2552005648612976, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.004, + "step": 19650 + }, + { + "epoch": 1.2032560132199033, + "grad_norm": 0.13963459432125092, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0035, + "step": 19660 + }, + { + "epoch": 1.2038680457800355, + "grad_norm": 0.17387327551841736, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0038, + "step": 19670 + }, + { + "epoch": 1.2044800783401677, + "grad_norm": 0.1284111589193344, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0044, + "step": 19680 + }, + { + "epoch": 1.2050921109002999, + "grad_norm": 0.22337380051612854, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 1.205704143460432, + "grad_norm": 0.2254808247089386, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0033, + "step": 19700 + }, + { + "epoch": 1.2063161760205643, + "grad_norm": 0.19316980242729187, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0041, + "step": 19710 + }, + { + "epoch": 1.2069282085806965, + "grad_norm": 0.17951075732707977, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0038, + "step": 19720 + }, + { + "epoch": 1.2075402411408287, + "grad_norm": 0.3105165660381317, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0043, + "step": 19730 + }, + { + "epoch": 1.208152273700961, + "grad_norm": 0.21083533763885498, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0039, + "step": 19740 + }, + { + "epoch": 1.208764306261093, + "grad_norm": 0.20121195912361145, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0035, + "step": 19750 + }, + { + "epoch": 1.2093763388212253, + "grad_norm": 0.20067447423934937, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0047, + "step": 19760 + }, + { + "epoch": 1.2099883713813575, + "grad_norm": 0.15943066775798798, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0039, + "step": 19770 + }, + { + "epoch": 1.2106004039414897, + "grad_norm": 0.21581032872200012, + "learning_rate": 6.975884226362e-06, + "loss": 0.0045, + "step": 19780 + }, + { + "epoch": 1.211212436501622, + "grad_norm": 0.16258753836154938, + "learning_rate": 6.967165692827958e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 1.2118244690617541, + "grad_norm": 0.18742400407791138, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0047, + "step": 19800 + }, + { + "epoch": 1.2124365016218863, + "grad_norm": 0.09035168588161469, + "learning_rate": 6.949742834253074e-06, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 1.2130485341820185, + "grad_norm": 0.21749694645404816, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0054, + "step": 19820 + }, + { + "epoch": 1.2136605667421507, + "grad_norm": 0.3189448416233063, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0043, + "step": 19830 + }, + { + "epoch": 1.214272599302283, + "grad_norm": 0.26815512776374817, + "learning_rate": 6.923644220932124e-06, + "loss": 0.005, + "step": 19840 + }, + { + "epoch": 1.2148846318624151, + "grad_norm": 0.19533704221248627, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0037, + "step": 19850 + }, + { + "epoch": 1.2154966644225473, + "grad_norm": 0.36249589920043945, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0064, + "step": 19860 + }, + { + "epoch": 1.2161086969826795, + "grad_norm": 0.19801265001296997, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0042, + "step": 19870 + }, + { + "epoch": 1.2167207295428117, + "grad_norm": 0.10341386497020721, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0053, + "step": 19880 + }, + { + "epoch": 1.217332762102944, + "grad_norm": 0.17985381186008453, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.2179447946630761, + "grad_norm": 0.18160982429981232, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 1.2185568272232083, + "grad_norm": 0.15552182495594025, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 1.2191688597833406, + "grad_norm": 0.34908807277679443, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0046, + "step": 19920 + }, + { + "epoch": 1.2197808923434728, + "grad_norm": 0.14835652709007263, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0042, + "step": 19930 + }, + { + "epoch": 1.220392924903605, + "grad_norm": 0.23276430368423462, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 1.2210049574637372, + "grad_norm": 0.1900823563337326, + "learning_rate": 6.828319751504063e-06, + "loss": 0.004, + "step": 19950 + }, + { + "epoch": 1.2216169900238694, + "grad_norm": 0.134046271443367, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0039, + "step": 19960 + }, + { + "epoch": 1.2222290225840013, + "grad_norm": 0.17264600098133087, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 1.2228410551441335, + "grad_norm": 0.24845834076404572, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0044, + "step": 19980 + }, + { + "epoch": 1.2234530877042658, + "grad_norm": 0.14805762469768524, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0049, + "step": 19990 + }, + { + "epoch": 1.224065120264398, + "grad_norm": 0.228907972574234, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0043, + "step": 20000 + }, + { + "epoch": 1.2246771528245302, + "grad_norm": 0.16869507730007172, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0041, + "step": 20010 + }, + { + "epoch": 1.2252891853846624, + "grad_norm": 0.1983603835105896, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0041, + "step": 20020 + }, + { + "epoch": 1.2259012179447946, + "grad_norm": 0.17656362056732178, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0028, + "step": 20030 + }, + { + "epoch": 1.2265132505049268, + "grad_norm": 0.1360313892364502, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0069, + "step": 20040 + }, + { + "epoch": 1.227125283065059, + "grad_norm": 0.21057721972465515, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2277373156251912, + "grad_norm": 0.138632670044899, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0038, + "step": 20060 + }, + { + "epoch": 1.2283493481853234, + "grad_norm": 0.17815573513507843, + "learning_rate": 6.725005485342219e-06, + "loss": 0.003, + "step": 20070 + }, + { + "epoch": 1.2289613807454556, + "grad_norm": 0.1769353598356247, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0066, + "step": 20080 + }, + { + "epoch": 1.2295734133055878, + "grad_norm": 0.23068928718566895, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0048, + "step": 20090 + }, + { + "epoch": 1.23018544586572, + "grad_norm": 0.25139328837394714, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0049, + "step": 20100 + }, + { + "epoch": 1.2307974784258522, + "grad_norm": 0.09128634631633759, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0042, + "step": 20110 + }, + { + "epoch": 1.2314095109859844, + "grad_norm": 0.20516613125801086, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0031, + "step": 20120 + }, + { + "epoch": 1.2320215435461166, + "grad_norm": 0.1518358588218689, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0049, + "step": 20130 + }, + { + "epoch": 1.2326335761062488, + "grad_norm": 0.1673758625984192, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0044, + "step": 20140 + }, + { + "epoch": 1.233245608666381, + "grad_norm": 0.14084585011005402, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0053, + "step": 20150 + }, + { + "epoch": 1.2338576412265132, + "grad_norm": 0.23316942155361176, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0042, + "step": 20160 + }, + { + "epoch": 1.2344696737866454, + "grad_norm": 0.23793813586235046, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0045, + "step": 20170 + }, + { + "epoch": 1.2350817063467776, + "grad_norm": 0.4269389510154724, + "learning_rate": 6.630934952049143e-06, + "loss": 0.005, + "step": 20180 + }, + { + "epoch": 1.2356937389069098, + "grad_norm": 0.15654191374778748, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0039, + "step": 20190 + }, + { + "epoch": 1.236305771467042, + "grad_norm": 0.19204623997211456, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0032, + "step": 20200 + }, + { + "epoch": 1.2369178040271742, + "grad_norm": 0.15817691385746002, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0044, + "step": 20210 + }, + { + "epoch": 1.2375298365873064, + "grad_norm": 0.12637947499752045, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2381418691474386, + "grad_norm": 0.26657921075820923, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0041, + "step": 20230 + }, + { + "epoch": 1.2387539017075708, + "grad_norm": 0.15207791328430176, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0045, + "step": 20240 + }, + { + "epoch": 1.239365934267703, + "grad_norm": 0.32583367824554443, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 1.2399779668278352, + "grad_norm": 0.15617726743221283, + "learning_rate": 6.562908932779455e-06, + "loss": 0.004, + "step": 20260 + }, + { + "epoch": 1.2405899993879674, + "grad_norm": 0.1935809850692749, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2412020319480996, + "grad_norm": 0.17422369122505188, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0035, + "step": 20280 + }, + { + "epoch": 1.2418140645082318, + "grad_norm": 0.15332955121994019, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0049, + "step": 20290 + }, + { + "epoch": 1.242426097068364, + "grad_norm": 0.16183018684387207, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0042, + "step": 20300 + }, + { + "epoch": 1.2430381296284962, + "grad_norm": 0.28421106934547424, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0045, + "step": 20310 + }, + { + "epoch": 1.2436501621886284, + "grad_norm": 0.23288874328136444, + "learning_rate": 6.512107839793337e-06, + "loss": 0.004, + "step": 20320 + }, + { + "epoch": 1.2442621947487607, + "grad_norm": 0.17955242097377777, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 1.2448742273088929, + "grad_norm": 0.20192117989063263, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 1.245486259869025, + "grad_norm": 0.15365810692310333, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 1.2460982924291573, + "grad_norm": 0.25220832228660583, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0039, + "step": 20360 + }, + { + "epoch": 1.2467103249892895, + "grad_norm": 0.25777462124824524, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0053, + "step": 20370 + }, + { + "epoch": 1.2473223575494217, + "grad_norm": 0.2693277895450592, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0053, + "step": 20380 + }, + { + "epoch": 1.2479343901095539, + "grad_norm": 0.22846420109272003, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0033, + "step": 20390 + }, + { + "epoch": 1.248546422669686, + "grad_norm": 0.17022505402565002, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0046, + "step": 20400 + }, + { + "epoch": 1.2491584552298183, + "grad_norm": 0.08295682072639465, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0035, + "step": 20410 + }, + { + "epoch": 1.2497704877899505, + "grad_norm": 0.2745625972747803, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0044, + "step": 20420 + }, + { + "epoch": 1.2503825203500827, + "grad_norm": 0.12855033576488495, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2509945529102149, + "grad_norm": 0.30358386039733887, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0049, + "step": 20440 + }, + { + "epoch": 1.251606585470347, + "grad_norm": 0.15514959394931793, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0048, + "step": 20450 + }, + { + "epoch": 1.2522186180304793, + "grad_norm": 0.1414988487958908, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0088, + "step": 20460 + }, + { + "epoch": 1.2528306505906115, + "grad_norm": 0.17399665713310242, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0046, + "step": 20470 + }, + { + "epoch": 1.2534426831507437, + "grad_norm": 0.22629426419734955, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.254054715710876, + "grad_norm": 0.30595293641090393, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0027, + "step": 20490 + }, + { + "epoch": 1.254666748271008, + "grad_norm": 0.17980262637138367, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2552787808311403, + "grad_norm": 0.19016452133655548, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0029, + "step": 20510 + }, + { + "epoch": 1.2558908133912725, + "grad_norm": 0.20200394093990326, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0037, + "step": 20520 + }, + { + "epoch": 1.2565028459514047, + "grad_norm": 0.15347513556480408, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0041, + "step": 20530 + }, + { + "epoch": 1.257114878511537, + "grad_norm": 0.1851687729358673, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0042, + "step": 20540 + }, + { + "epoch": 1.2577269110716691, + "grad_norm": 0.2529662549495697, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0037, + "step": 20550 + }, + { + "epoch": 1.2583389436318013, + "grad_norm": 0.18209592998027802, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 1.2589509761919335, + "grad_norm": 0.18981963396072388, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0036, + "step": 20570 + }, + { + "epoch": 1.2595630087520657, + "grad_norm": 0.13232728838920593, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0035, + "step": 20580 + }, + { + "epoch": 1.260175041312198, + "grad_norm": 0.133514404296875, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0039, + "step": 20590 + }, + { + "epoch": 1.2607870738723301, + "grad_norm": 0.14339123666286469, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0043, + "step": 20600 + }, + { + "epoch": 1.2613991064324623, + "grad_norm": 0.48857489228248596, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0045, + "step": 20610 + }, + { + "epoch": 1.2620111389925945, + "grad_norm": 0.1513262242078781, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0029, + "step": 20620 + }, + { + "epoch": 1.2626231715527267, + "grad_norm": 0.1497354805469513, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0039, + "step": 20630 + }, + { + "epoch": 1.2632352041128587, + "grad_norm": 0.132791206240654, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0037, + "step": 20640 + }, + { + "epoch": 1.263847236672991, + "grad_norm": 0.13804496824741364, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0035, + "step": 20650 + }, + { + "epoch": 1.2644592692331231, + "grad_norm": 0.19393391907215118, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0049, + "step": 20660 + }, + { + "epoch": 1.2650713017932553, + "grad_norm": 0.17623338103294373, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0038, + "step": 20670 + }, + { + "epoch": 1.2656833343533875, + "grad_norm": 0.26931124925613403, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0042, + "step": 20680 + }, + { + "epoch": 1.2662953669135197, + "grad_norm": 0.17984439432621002, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0036, + "step": 20690 + }, + { + "epoch": 1.266907399473652, + "grad_norm": 0.19648219645023346, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0046, + "step": 20700 + }, + { + "epoch": 1.2675194320337841, + "grad_norm": 0.1464766263961792, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0024, + "step": 20710 + }, + { + "epoch": 1.2681314645939163, + "grad_norm": 0.1271074265241623, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0042, + "step": 20720 + }, + { + "epoch": 1.2687434971540485, + "grad_norm": 0.15960967540740967, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0079, + "step": 20730 + }, + { + "epoch": 1.2693555297141808, + "grad_norm": 0.13636153936386108, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0046, + "step": 20740 + }, + { + "epoch": 1.269967562274313, + "grad_norm": 0.19099050760269165, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0046, + "step": 20750 + }, + { + "epoch": 1.2705795948344452, + "grad_norm": 0.28632739186286926, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0036, + "step": 20760 + }, + { + "epoch": 1.2711916273945774, + "grad_norm": 0.2565019726753235, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0055, + "step": 20770 + }, + { + "epoch": 1.2718036599547096, + "grad_norm": 0.24443399906158447, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2724156925148418, + "grad_norm": 0.1396762877702713, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0029, + "step": 20790 + }, + { + "epoch": 1.273027725074974, + "grad_norm": 0.3028377890586853, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0061, + "step": 20800 + }, + { + "epoch": 1.2736397576351062, + "grad_norm": 0.18195804953575134, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0034, + "step": 20810 + }, + { + "epoch": 1.2742517901952384, + "grad_norm": 0.16194652020931244, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0054, + "step": 20820 + }, + { + "epoch": 1.2748638227553706, + "grad_norm": 0.13011956214904785, + "learning_rate": 6.08816828695283e-06, + "loss": 0.003, + "step": 20830 + }, + { + "epoch": 1.2754758553155028, + "grad_norm": 0.23294220864772797, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0041, + "step": 20840 + }, + { + "epoch": 1.276087887875635, + "grad_norm": 0.1892961710691452, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0031, + "step": 20850 + }, + { + "epoch": 1.2766999204357672, + "grad_norm": 0.1984476000070572, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0046, + "step": 20860 + }, + { + "epoch": 1.2773119529958994, + "grad_norm": 0.158709317445755, + "learning_rate": 6.055535530104466e-06, + "loss": 0.003, + "step": 20870 + }, + { + "epoch": 1.2779239855560316, + "grad_norm": 0.16505110263824463, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0039, + "step": 20880 + }, + { + "epoch": 1.2785360181161638, + "grad_norm": 0.18332232534885406, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0036, + "step": 20890 + }, + { + "epoch": 1.279148050676296, + "grad_norm": 0.1797804981470108, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0049, + "step": 20900 + }, + { + "epoch": 1.2797600832364282, + "grad_norm": 0.19247964024543762, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0053, + "step": 20910 + }, + { + "epoch": 1.2803721157965604, + "grad_norm": 0.17845408618450165, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0045, + "step": 20920 + }, + { + "epoch": 1.2809841483566926, + "grad_norm": 0.09454555809497833, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0027, + "step": 20930 + }, + { + "epoch": 1.2815961809168248, + "grad_norm": 0.12647129595279694, + "learning_rate": 5.998651973182953e-06, + "loss": 0.004, + "step": 20940 + }, + { + "epoch": 1.282208213476957, + "grad_norm": 0.39115941524505615, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0051, + "step": 20950 + }, + { + "epoch": 1.2828202460370892, + "grad_norm": 0.29081296920776367, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 1.2834322785972214, + "grad_norm": 0.1849275827407837, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0042, + "step": 20970 + }, + { + "epoch": 1.2840443111573536, + "grad_norm": 0.24075689911842346, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0031, + "step": 20980 + }, + { + "epoch": 1.2846563437174858, + "grad_norm": 0.12463482469320297, + "learning_rate": 5.958196751005967e-06, + "loss": 0.003, + "step": 20990 + }, + { + "epoch": 1.285268376277618, + "grad_norm": 0.16987742483615875, + "learning_rate": 5.950123419134817e-06, + "loss": 0.004, + "step": 21000 + }, + { + "epoch": 1.2858804088377502, + "grad_norm": 0.20316782593727112, + "learning_rate": 5.942056013575106e-06, + "loss": 0.004, + "step": 21010 + }, + { + "epoch": 1.2864924413978824, + "grad_norm": 0.20989514887332916, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0053, + "step": 21020 + }, + { + "epoch": 1.2871044739580146, + "grad_norm": 0.33795273303985596, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0048, + "step": 21030 + }, + { + "epoch": 1.2877165065181468, + "grad_norm": 0.13918501138687134, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.004, + "step": 21040 + }, + { + "epoch": 1.288328539078279, + "grad_norm": 0.2992899715900421, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0038, + "step": 21050 + }, + { + "epoch": 1.288940571638411, + "grad_norm": 0.2540164589881897, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 1.2895526041985432, + "grad_norm": 0.161032035946846, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0047, + "step": 21070 + }, + { + "epoch": 1.2901646367586754, + "grad_norm": 0.1743200421333313, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0037, + "step": 21080 + }, + { + "epoch": 1.2907766693188076, + "grad_norm": 0.26604363322257996, + "learning_rate": 5.877731250949785e-06, + "loss": 0.004, + "step": 21090 + }, + { + "epoch": 1.2913887018789398, + "grad_norm": 0.275696724653244, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0044, + "step": 21100 + }, + { + "epoch": 1.292000734439072, + "grad_norm": 0.16888457536697388, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0042, + "step": 21110 + }, + { + "epoch": 1.2926127669992042, + "grad_norm": 0.12902231514453888, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0048, + "step": 21120 + }, + { + "epoch": 1.2932247995593364, + "grad_norm": 0.14577728509902954, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0046, + "step": 21130 + }, + { + "epoch": 1.2938368321194686, + "grad_norm": 0.1544434279203415, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0031, + "step": 21140 + }, + { + "epoch": 1.2944488646796009, + "grad_norm": 0.09238115698099136, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0035, + "step": 21150 + }, + { + "epoch": 1.295060897239733, + "grad_norm": 0.1770051270723343, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0033, + "step": 21160 + }, + { + "epoch": 1.2956729297998653, + "grad_norm": 0.20360831916332245, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0037, + "step": 21170 + }, + { + "epoch": 1.2962849623599975, + "grad_norm": 0.18503794074058533, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0045, + "step": 21180 + }, + { + "epoch": 1.2968969949201297, + "grad_norm": 0.12918968498706818, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0048, + "step": 21190 + }, + { + "epoch": 1.2975090274802619, + "grad_norm": 0.14289438724517822, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0041, + "step": 21200 + }, + { + "epoch": 1.298121060040394, + "grad_norm": 0.17546117305755615, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0042, + "step": 21210 + }, + { + "epoch": 1.2987330926005263, + "grad_norm": 0.2919277846813202, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0051, + "step": 21220 + }, + { + "epoch": 1.2993451251606585, + "grad_norm": 0.0988069474697113, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0044, + "step": 21230 + }, + { + "epoch": 1.2999571577207907, + "grad_norm": 0.19284513592720032, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0037, + "step": 21240 + }, + { + "epoch": 1.3005691902809229, + "grad_norm": 0.12894058227539062, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0031, + "step": 21250 + }, + { + "epoch": 1.301181222841055, + "grad_norm": 0.14740346372127533, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.3017932554011873, + "grad_norm": 0.16817794740200043, + "learning_rate": 5.734414476316747e-06, + "loss": 0.005, + "step": 21270 + }, + { + "epoch": 1.3024052879613195, + "grad_norm": 0.29237234592437744, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0039, + "step": 21280 + }, + { + "epoch": 1.3030173205214517, + "grad_norm": 0.12649856507778168, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.303629353081584, + "grad_norm": 0.11057443916797638, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0039, + "step": 21300 + }, + { + "epoch": 1.304241385641716, + "grad_norm": 0.13494674861431122, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.3048534182018483, + "grad_norm": 0.3079472482204437, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0042, + "step": 21320 + }, + { + "epoch": 1.3054654507619805, + "grad_norm": 0.13513535261154175, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.3060774833221127, + "grad_norm": 0.39266663789749146, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0046, + "step": 21340 + }, + { + "epoch": 1.306689515882245, + "grad_norm": 0.15097978711128235, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.3073015484423771, + "grad_norm": 0.25206202268600464, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0049, + "step": 21360 + }, + { + "epoch": 1.3079135810025093, + "grad_norm": 0.16765817999839783, + "learning_rate": 5.655655685355026e-06, + "loss": 0.005, + "step": 21370 + }, + { + "epoch": 1.3085256135626415, + "grad_norm": 0.2137158215045929, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0048, + "step": 21380 + }, + { + "epoch": 1.3091376461227737, + "grad_norm": 0.19711454212665558, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0043, + "step": 21390 + }, + { + "epoch": 1.309749678682906, + "grad_norm": 0.1722051054239273, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0044, + "step": 21400 + }, + { + "epoch": 1.3103617112430381, + "grad_norm": 0.1807536482810974, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 1.3109737438031703, + "grad_norm": 0.15052185952663422, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.004, + "step": 21420 + }, + { + "epoch": 1.3115857763633025, + "grad_norm": 0.1485220491886139, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.3121978089234347, + "grad_norm": 0.15065325796604156, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0037, + "step": 21440 + }, + { + "epoch": 1.312809841483567, + "grad_norm": 0.17903591692447662, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0047, + "step": 21450 + }, + { + "epoch": 1.3134218740436991, + "grad_norm": 0.14310622215270996, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0043, + "step": 21460 + }, + { + "epoch": 1.3140339066038313, + "grad_norm": 0.12117830663919449, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0053, + "step": 21470 + }, + { + "epoch": 1.3146459391639636, + "grad_norm": 0.1484573632478714, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 1.3152579717240958, + "grad_norm": 0.16559219360351562, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0037, + "step": 21490 + }, + { + "epoch": 1.315870004284228, + "grad_norm": 0.21626432240009308, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0031, + "step": 21500 + }, + { + "epoch": 1.3164820368443602, + "grad_norm": 0.08177383989095688, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0052, + "step": 21510 + }, + { + "epoch": 1.3170940694044924, + "grad_norm": 0.18640732765197754, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0062, + "step": 21520 + }, + { + "epoch": 1.3177061019646246, + "grad_norm": 0.2599853277206421, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0039, + "step": 21530 + }, + { + "epoch": 1.3183181345247568, + "grad_norm": 0.1591203212738037, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0034, + "step": 21540 + }, + { + "epoch": 1.318930167084889, + "grad_norm": 0.2834412455558777, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0037, + "step": 21550 + }, + { + "epoch": 1.3195421996450212, + "grad_norm": 0.13853803277015686, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0038, + "step": 21560 + }, + { + "epoch": 1.3201542322051534, + "grad_norm": 0.14707128703594208, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0042, + "step": 21570 + }, + { + "epoch": 1.3207662647652856, + "grad_norm": 0.12561920285224915, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0038, + "step": 21580 + }, + { + "epoch": 1.3213782973254178, + "grad_norm": 0.4156799018383026, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0051, + "step": 21590 + }, + { + "epoch": 1.32199032988555, + "grad_norm": 0.11400662362575531, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0031, + "step": 21600 + }, + { + "epoch": 1.3226023624456822, + "grad_norm": 0.15658807754516602, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0052, + "step": 21610 + }, + { + "epoch": 1.3232143950058144, + "grad_norm": 0.1212862953543663, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0034, + "step": 21620 + }, + { + "epoch": 1.3238264275659466, + "grad_norm": 0.2201654314994812, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0036, + "step": 21630 + }, + { + "epoch": 1.3244384601260788, + "grad_norm": 0.11623375117778778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0032, + "step": 21640 + }, + { + "epoch": 1.325050492686211, + "grad_norm": 0.13092897832393646, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0035, + "step": 21650 + }, + { + "epoch": 1.3256625252463432, + "grad_norm": 0.15409153699874878, + "learning_rate": 5.430834687545416e-06, + "loss": 0.004, + "step": 21660 + }, + { + "epoch": 1.3262745578064754, + "grad_norm": 0.3148297369480133, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0031, + "step": 21670 + }, + { + "epoch": 1.3268865903666076, + "grad_norm": 0.13435055315494537, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0033, + "step": 21680 + }, + { + "epoch": 1.3274986229267398, + "grad_norm": 0.17878089845180511, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0029, + "step": 21690 + }, + { + "epoch": 1.328110655486872, + "grad_norm": 0.1823783665895462, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0039, + "step": 21700 + }, + { + "epoch": 1.3287226880470042, + "grad_norm": 0.14492660760879517, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0033, + "step": 21710 + }, + { + "epoch": 1.3293347206071364, + "grad_norm": 0.1730341762304306, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0041, + "step": 21720 + }, + { + "epoch": 1.3299467531672686, + "grad_norm": 0.07961586117744446, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3305587857274008, + "grad_norm": 0.14440582692623138, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0038, + "step": 21740 + }, + { + "epoch": 1.331170818287533, + "grad_norm": 0.22034496068954468, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0023, + "step": 21750 + }, + { + "epoch": 1.3317828508476652, + "grad_norm": 0.1861305832862854, + "learning_rate": 5.354573491223212e-06, + "loss": 0.005, + "step": 21760 + }, + { + "epoch": 1.3323948834077972, + "grad_norm": 0.15587164461612701, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0044, + "step": 21770 + }, + { + "epoch": 1.3330069159679294, + "grad_norm": 0.6852900981903076, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0075, + "step": 21780 + }, + { + "epoch": 1.3336189485280616, + "grad_norm": 0.14315280318260193, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0034, + "step": 21790 + }, + { + "epoch": 1.3342309810881938, + "grad_norm": 0.350981205701828, + "learning_rate": 5.324254018551227e-06, + "loss": 0.004, + "step": 21800 + }, + { + "epoch": 1.334843013648326, + "grad_norm": 0.12344911694526672, + "learning_rate": 5.316690780174352e-06, + "loss": 0.004, + "step": 21810 + }, + { + "epoch": 1.3354550462084582, + "grad_norm": 0.18744061887264252, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3360670787685904, + "grad_norm": 0.22747837007045746, + "learning_rate": 5.301584321328435e-06, + "loss": 0.004, + "step": 21830 + }, + { + "epoch": 1.3366791113287226, + "grad_norm": 0.22695699334144592, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0042, + "step": 21840 + }, + { + "epoch": 1.3372911438888548, + "grad_norm": 0.17258964478969574, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0044, + "step": 21850 + }, + { + "epoch": 1.337903176448987, + "grad_norm": 0.1523793637752533, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0047, + "step": 21860 + }, + { + "epoch": 1.3385152090091192, + "grad_norm": 0.1983587145805359, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0037, + "step": 21870 + }, + { + "epoch": 1.3391272415692514, + "grad_norm": 0.1263747215270996, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0034, + "step": 21880 + }, + { + "epoch": 1.3397392741293837, + "grad_norm": 0.1550009399652481, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3403513066895159, + "grad_norm": 0.14963915944099426, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.340963339249648, + "grad_norm": 0.17783671617507935, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0036, + "step": 21910 + }, + { + "epoch": 1.3415753718097803, + "grad_norm": 0.2715896964073181, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3421874043699125, + "grad_norm": 0.22924886643886566, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0037, + "step": 21930 + }, + { + "epoch": 1.3427994369300447, + "grad_norm": 0.13689789175987244, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0033, + "step": 21940 + }, + { + "epoch": 1.3434114694901769, + "grad_norm": 0.09137748926877975, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0031, + "step": 21950 + }, + { + "epoch": 1.344023502050309, + "grad_norm": 0.17097881436347961, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0031, + "step": 21960 + }, + { + "epoch": 1.3446355346104413, + "grad_norm": 0.23919200897216797, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0046, + "step": 21970 + }, + { + "epoch": 1.3452475671705735, + "grad_norm": 0.14261527359485626, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0037, + "step": 21980 + }, + { + "epoch": 1.3458595997307057, + "grad_norm": 0.156734898686409, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.3464716322908379, + "grad_norm": 0.21755588054656982, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0032, + "step": 22000 + }, + { + "epoch": 1.34708366485097, + "grad_norm": 0.1373317390680313, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0033, + "step": 22010 + }, + { + "epoch": 1.3476956974111023, + "grad_norm": 0.1646856814622879, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0047, + "step": 22020 + }, + { + "epoch": 1.3483077299712345, + "grad_norm": 0.1908850073814392, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0044, + "step": 22030 + }, + { + "epoch": 1.3489197625313667, + "grad_norm": 0.24862833321094513, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0041, + "step": 22040 + }, + { + "epoch": 1.349531795091499, + "grad_norm": 0.15980397164821625, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0033, + "step": 22050 + }, + { + "epoch": 1.350143827651631, + "grad_norm": 0.1157977357506752, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0036, + "step": 22060 + }, + { + "epoch": 1.3507558602117633, + "grad_norm": 0.11186888068914413, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0046, + "step": 22070 + }, + { + "epoch": 1.3513678927718955, + "grad_norm": 0.17715996503829956, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0035, + "step": 22080 + }, + { + "epoch": 1.3519799253320277, + "grad_norm": 0.1265174001455307, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0048, + "step": 22090 + }, + { + "epoch": 1.35259195789216, + "grad_norm": 0.13969522714614868, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0028, + "step": 22100 + }, + { + "epoch": 1.3532039904522921, + "grad_norm": 0.13246525824069977, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0026, + "step": 22110 + }, + { + "epoch": 1.3538160230124243, + "grad_norm": 0.14675064384937286, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0082, + "step": 22120 + }, + { + "epoch": 1.3544280555725565, + "grad_norm": 0.15810683369636536, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0031, + "step": 22130 + }, + { + "epoch": 1.3550400881326887, + "grad_norm": 0.20675864815711975, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0035, + "step": 22140 + }, + { + "epoch": 1.355652120692821, + "grad_norm": 0.1921442300081253, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0038, + "step": 22150 + }, + { + "epoch": 1.3562641532529531, + "grad_norm": 0.14300711452960968, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0035, + "step": 22160 + }, + { + "epoch": 1.3568761858130853, + "grad_norm": 0.0656728520989418, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0047, + "step": 22170 + }, + { + "epoch": 1.3574882183732175, + "grad_norm": 0.148203507065773, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0041, + "step": 22180 + }, + { + "epoch": 1.3581002509333495, + "grad_norm": 0.15472126007080078, + "learning_rate": 5.034310349217475e-06, + "loss": 0.004, + "step": 22190 + }, + { + "epoch": 1.3587122834934817, + "grad_norm": 0.12006669491529465, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0034, + "step": 22200 + }, + { + "epoch": 1.359324316053614, + "grad_norm": 0.15345145761966705, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0032, + "step": 22210 + }, + { + "epoch": 1.3599363486137461, + "grad_norm": 0.17429186403751373, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0039, + "step": 22220 + }, + { + "epoch": 1.3605483811738783, + "grad_norm": 0.20691345632076263, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0029, + "step": 22230 + }, + { + "epoch": 1.3611604137340105, + "grad_norm": 0.1874946504831314, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0042, + "step": 22240 + }, + { + "epoch": 1.3617724462941427, + "grad_norm": 0.12159912288188934, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0033, + "step": 22250 + }, + { + "epoch": 1.362384478854275, + "grad_norm": 0.29434919357299805, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0044, + "step": 22260 + }, + { + "epoch": 1.3629965114144071, + "grad_norm": 0.06661798804998398, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0031, + "step": 22270 + }, + { + "epoch": 1.3636085439745393, + "grad_norm": 0.14819994568824768, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0039, + "step": 22280 + }, + { + "epoch": 1.3642205765346715, + "grad_norm": 0.17289887368679047, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0035, + "step": 22290 + }, + { + "epoch": 1.3648326090948038, + "grad_norm": 0.18789313733577728, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0036, + "step": 22300 + }, + { + "epoch": 1.365444641654936, + "grad_norm": 0.1877586394548416, + "learning_rate": 4.947215397583639e-06, + "loss": 0.004, + "step": 22310 + }, + { + "epoch": 1.3660566742150682, + "grad_norm": 0.11696574836969376, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0036, + "step": 22320 + }, + { + "epoch": 1.3666687067752004, + "grad_norm": 0.2511763274669647, + "learning_rate": 4.932798621873274e-06, + "loss": 0.004, + "step": 22330 + }, + { + "epoch": 1.3672807393353326, + "grad_norm": 0.15005314350128174, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0046, + "step": 22340 + }, + { + "epoch": 1.3678927718954648, + "grad_norm": 0.16856855154037476, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0056, + "step": 22350 + }, + { + "epoch": 1.368504804455597, + "grad_norm": 0.24532385170459747, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0035, + "step": 22360 + }, + { + "epoch": 1.3691168370157292, + "grad_norm": 0.29320162534713745, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0047, + "step": 22370 + }, + { + "epoch": 1.3697288695758614, + "grad_norm": 0.1518300473690033, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0041, + "step": 22380 + }, + { + "epoch": 1.3703409021359936, + "grad_norm": 0.13431201875209808, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0033, + "step": 22390 + }, + { + "epoch": 1.3709529346961258, + "grad_norm": 0.17390409111976624, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0039, + "step": 22400 + }, + { + "epoch": 1.371564967256258, + "grad_norm": 0.16482478380203247, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.007, + "step": 22410 + }, + { + "epoch": 1.3721769998163902, + "grad_norm": 0.11469490826129913, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0041, + "step": 22420 + }, + { + "epoch": 1.3727890323765224, + "grad_norm": 0.2327135056257248, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0043, + "step": 22430 + }, + { + "epoch": 1.3734010649366546, + "grad_norm": 0.1373092532157898, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0036, + "step": 22440 + }, + { + "epoch": 1.3740130974967868, + "grad_norm": 0.1534084528684616, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0028, + "step": 22450 + }, + { + "epoch": 1.374625130056919, + "grad_norm": 0.3217960596084595, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0044, + "step": 22460 + }, + { + "epoch": 1.3752371626170512, + "grad_norm": 0.14245563745498657, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0039, + "step": 22470 + }, + { + "epoch": 1.3758491951771834, + "grad_norm": 0.17652876675128937, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0031, + "step": 22480 + }, + { + "epoch": 1.3764612277373156, + "grad_norm": 0.1996244192123413, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0034, + "step": 22490 + }, + { + "epoch": 1.3770732602974478, + "grad_norm": 0.1658472716808319, + "learning_rate": 4.81141273556404e-06, + "loss": 0.003, + "step": 22500 + }, + { + "epoch": 1.37768529285758, + "grad_norm": 0.16233472526073456, + "learning_rate": 4.804337352679613e-06, + "loss": 0.004, + "step": 22510 + }, + { + "epoch": 1.3782973254177122, + "grad_norm": 0.13045033812522888, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0049, + "step": 22520 + }, + { + "epoch": 1.3789093579778444, + "grad_norm": 0.1195274218916893, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0042, + "step": 22530 + }, + { + "epoch": 1.3795213905379766, + "grad_norm": 0.14395804703235626, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0036, + "step": 22540 + }, + { + "epoch": 1.3801334230981088, + "grad_norm": 0.24495497345924377, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0047, + "step": 22550 + }, + { + "epoch": 1.380745455658241, + "grad_norm": 0.14288006722927094, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0044, + "step": 22560 + }, + { + "epoch": 1.3813574882183732, + "grad_norm": 0.16967979073524475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0051, + "step": 22570 + }, + { + "epoch": 1.3819695207785054, + "grad_norm": 0.2023036777973175, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0032, + "step": 22580 + }, + { + "epoch": 1.3825815533386376, + "grad_norm": 0.1191902756690979, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0026, + "step": 22590 + }, + { + "epoch": 1.3831935858987698, + "grad_norm": 0.16922403872013092, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0037, + "step": 22600 + }, + { + "epoch": 1.383805618458902, + "grad_norm": 0.12394976615905762, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0038, + "step": 22610 + }, + { + "epoch": 1.3844176510190342, + "grad_norm": 0.23889753222465515, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0041, + "step": 22620 + }, + { + "epoch": 1.3850296835791664, + "grad_norm": 0.31215062737464905, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0036, + "step": 22630 + }, + { + "epoch": 1.3856417161392987, + "grad_norm": 0.1519152820110321, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0042, + "step": 22640 + }, + { + "epoch": 1.3862537486994309, + "grad_norm": 0.3375433683395386, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0039, + "step": 22650 + }, + { + "epoch": 1.386865781259563, + "grad_norm": 0.21715323626995087, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0027, + "step": 22660 + }, + { + "epoch": 1.3874778138196953, + "grad_norm": 0.2066027969121933, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0033, + "step": 22670 + }, + { + "epoch": 1.3880898463798275, + "grad_norm": 0.11542408168315887, + "learning_rate": 4.6851750421442e-06, + "loss": 0.004, + "step": 22680 + }, + { + "epoch": 1.3887018789399597, + "grad_norm": 0.1183561235666275, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0037, + "step": 22690 + }, + { + "epoch": 1.3893139115000919, + "grad_norm": 0.24478662014007568, + "learning_rate": 4.67129597392514e-06, + "loss": 0.004, + "step": 22700 + }, + { + "epoch": 1.389925944060224, + "grad_norm": 0.28880801796913147, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0039, + "step": 22710 + }, + { + "epoch": 1.3905379766203563, + "grad_norm": 0.14014701545238495, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0034, + "step": 22720 + }, + { + "epoch": 1.3911500091804885, + "grad_norm": 0.1549793928861618, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0033, + "step": 22730 + }, + { + "epoch": 1.3917620417406207, + "grad_norm": 0.1423012614250183, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.3923740743007529, + "grad_norm": 0.291273832321167, + "learning_rate": 4.636728419531758e-06, + "loss": 0.004, + "step": 22750 + }, + { + "epoch": 1.392986106860885, + "grad_norm": 0.38278621435165405, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0045, + "step": 22760 + }, + { + "epoch": 1.3935981394210173, + "grad_norm": 0.20528365671634674, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0042, + "step": 22770 + }, + { + "epoch": 1.3942101719811495, + "grad_norm": 0.11913729459047318, + "learning_rate": 4.616077433849538e-06, + "loss": 0.003, + "step": 22780 + }, + { + "epoch": 1.3948222045412817, + "grad_norm": 0.21683627367019653, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0027, + "step": 22790 + }, + { + "epoch": 1.395434237101414, + "grad_norm": 0.12143554538488388, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0031, + "step": 22800 + }, + { + "epoch": 1.396046269661546, + "grad_norm": 0.14171159267425537, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0039, + "step": 22810 + }, + { + "epoch": 1.3966583022216783, + "grad_norm": 0.19254790246486664, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0043, + "step": 22820 + }, + { + "epoch": 1.3972703347818105, + "grad_norm": 0.12295825034379959, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0045, + "step": 22830 + }, + { + "epoch": 1.3978823673419427, + "grad_norm": 0.1274985820055008, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0037, + "step": 22840 + }, + { + "epoch": 1.398494399902075, + "grad_norm": 0.2940427362918854, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0059, + "step": 22850 + }, + { + "epoch": 1.3991064324622071, + "grad_norm": 0.15357589721679688, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0032, + "step": 22860 + }, + { + "epoch": 1.3997184650223393, + "grad_norm": 0.12781603634357452, + "learning_rate": 4.554529907376127e-06, + "loss": 0.003, + "step": 22870 + }, + { + "epoch": 1.4003304975824715, + "grad_norm": 0.34976109862327576, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0047, + "step": 22880 + }, + { + "epoch": 1.4009425301426035, + "grad_norm": 0.1797824203968048, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0034, + "step": 22890 + }, + { + "epoch": 1.4015545627027357, + "grad_norm": 0.13750647008419037, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0046, + "step": 22900 + }, + { + "epoch": 1.402166595262868, + "grad_norm": 0.22893266379833221, + "learning_rate": 4.527371771040039e-06, + "loss": 0.005, + "step": 22910 + }, + { + "epoch": 1.4027786278230001, + "grad_norm": 0.1595923751592636, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0045, + "step": 22920 + }, + { + "epoch": 1.4033906603831323, + "grad_norm": 0.11474192142486572, + "learning_rate": 4.513838246961138e-06, + "loss": 0.003, + "step": 22930 + }, + { + "epoch": 1.4040026929432645, + "grad_norm": 0.12208060175180435, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.4046147255033967, + "grad_norm": 0.2919016480445862, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0036, + "step": 22950 + }, + { + "epoch": 1.405226758063529, + "grad_norm": 0.19161155819892883, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0049, + "step": 22960 + }, + { + "epoch": 1.4058387906236611, + "grad_norm": 0.1454700380563736, + "learning_rate": 4.486862604628113e-06, + "loss": 0.004, + "step": 22970 + }, + { + "epoch": 1.4064508231837933, + "grad_norm": 0.227305606007576, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0043, + "step": 22980 + }, + { + "epoch": 1.4070628557439255, + "grad_norm": 0.09430288523435593, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0051, + "step": 22990 + }, + { + "epoch": 1.4076748883040577, + "grad_norm": 0.09664178639650345, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.40828692086419, + "grad_norm": 0.21268269419670105, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0031, + "step": 23010 + }, + { + "epoch": 1.4088989534243221, + "grad_norm": 0.09796992689371109, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0041, + "step": 23020 + }, + { + "epoch": 1.4095109859844543, + "grad_norm": 0.18376071751117706, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0039, + "step": 23030 + }, + { + "epoch": 1.4101230185445865, + "grad_norm": 0.10276145488023758, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0035, + "step": 23040 + }, + { + "epoch": 1.4107350511047188, + "grad_norm": 0.16089564561843872, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0051, + "step": 23050 + }, + { + "epoch": 1.411347083664851, + "grad_norm": 0.1825491487979889, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0036, + "step": 23060 + }, + { + "epoch": 1.4119591162249832, + "grad_norm": 0.24405492842197418, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0028, + "step": 23070 + }, + { + "epoch": 1.4125711487851154, + "grad_norm": 0.14085668325424194, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0039, + "step": 23080 + }, + { + "epoch": 1.4131831813452476, + "grad_norm": 0.11708472669124603, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0035, + "step": 23090 + }, + { + "epoch": 1.4137952139053798, + "grad_norm": 0.12108796834945679, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0036, + "step": 23100 + }, + { + "epoch": 1.414407246465512, + "grad_norm": 0.14601854979991913, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0036, + "step": 23110 + }, + { + "epoch": 1.4150192790256442, + "grad_norm": 0.10614772886037827, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0031, + "step": 23120 + }, + { + "epoch": 1.4156313115857764, + "grad_norm": 0.09014416486024857, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0027, + "step": 23130 + }, + { + "epoch": 1.4162433441459086, + "grad_norm": 0.15246634185314178, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0031, + "step": 23140 + }, + { + "epoch": 1.4168553767060408, + "grad_norm": 0.20104879140853882, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0075, + "step": 23150 + }, + { + "epoch": 1.417467409266173, + "grad_norm": 0.1359969973564148, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0036, + "step": 23160 + }, + { + "epoch": 1.4180794418263052, + "grad_norm": 0.19849587976932526, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0054, + "step": 23170 + }, + { + "epoch": 1.4186914743864374, + "grad_norm": 0.12617377936840057, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0024, + "step": 23180 + }, + { + "epoch": 1.4193035069465696, + "grad_norm": 0.15024134516716003, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0034, + "step": 23190 + }, + { + "epoch": 1.4199155395067018, + "grad_norm": 0.2345605194568634, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0036, + "step": 23200 + }, + { + "epoch": 1.420527572066834, + "grad_norm": 0.13125917315483093, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0035, + "step": 23210 + }, + { + "epoch": 1.4211396046269662, + "grad_norm": 0.20977836847305298, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0038, + "step": 23220 + }, + { + "epoch": 1.4217516371870984, + "grad_norm": 0.3925677537918091, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0043, + "step": 23230 + }, + { + "epoch": 1.4223636697472306, + "grad_norm": 0.17691555619239807, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0048, + "step": 23240 + }, + { + "epoch": 1.4229757023073628, + "grad_norm": 0.18366187810897827, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0033, + "step": 23250 + }, + { + "epoch": 1.423587734867495, + "grad_norm": 0.15539205074310303, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0039, + "step": 23260 + }, + { + "epoch": 1.4241997674276272, + "grad_norm": 0.15048520267009735, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0032, + "step": 23270 + }, + { + "epoch": 1.4248117999877594, + "grad_norm": 0.2631739675998688, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0061, + "step": 23280 + }, + { + "epoch": 1.4254238325478916, + "grad_norm": 0.18545641005039215, + "learning_rate": 4.275502195405868e-06, + "loss": 0.005, + "step": 23290 + }, + { + "epoch": 1.4260358651080238, + "grad_norm": 0.25486356019973755, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.426647897668156, + "grad_norm": 0.2514204978942871, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.427259930228288, + "grad_norm": 0.12997376918792725, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0034, + "step": 23320 + }, + { + "epoch": 1.4278719627884202, + "grad_norm": 0.26096200942993164, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0047, + "step": 23330 + }, + { + "epoch": 1.4284839953485524, + "grad_norm": 0.2292930781841278, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0038, + "step": 23340 + }, + { + "epoch": 1.4290960279086846, + "grad_norm": 0.20056717097759247, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0037, + "step": 23350 + }, + { + "epoch": 1.4297080604688168, + "grad_norm": 0.1608581393957138, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0032, + "step": 23360 + }, + { + "epoch": 1.430320093028949, + "grad_norm": 0.235102578997612, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0034, + "step": 23370 + }, + { + "epoch": 1.4309321255890812, + "grad_norm": 0.11869259178638458, + "learning_rate": 4.217502203129258e-06, + "loss": 0.005, + "step": 23380 + }, + { + "epoch": 1.4315441581492134, + "grad_norm": 0.167036771774292, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0045, + "step": 23390 + }, + { + "epoch": 1.4321561907093456, + "grad_norm": 0.13766071200370789, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.4327682232694778, + "grad_norm": 0.15444986522197723, + "learning_rate": 4.198311874248223e-06, + "loss": 0.004, + "step": 23410 + }, + { + "epoch": 1.43338025582961, + "grad_norm": 0.11997724324464798, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0034, + "step": 23420 + }, + { + "epoch": 1.4339922883897422, + "grad_norm": 0.1533307433128357, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0038, + "step": 23430 + }, + { + "epoch": 1.4346043209498744, + "grad_norm": 0.10954161733388901, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0045, + "step": 23440 + }, + { + "epoch": 1.4352163535100066, + "grad_norm": 0.16601058840751648, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4358283860701389, + "grad_norm": 0.1756889373064041, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0035, + "step": 23460 + }, + { + "epoch": 1.436440418630271, + "grad_norm": 0.12633845210075378, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0046, + "step": 23470 + }, + { + "epoch": 1.4370524511904033, + "grad_norm": 0.15678541362285614, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0025, + "step": 23480 + }, + { + "epoch": 1.4376644837505355, + "grad_norm": 0.13923659920692444, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0052, + "step": 23490 + }, + { + "epoch": 1.4382765163106677, + "grad_norm": 0.28792211413383484, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4388885488707999, + "grad_norm": 0.16125047206878662, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0037, + "step": 23510 + }, + { + "epoch": 1.439500581430932, + "grad_norm": 0.2653597593307495, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0067, + "step": 23520 + }, + { + "epoch": 1.4401126139910643, + "grad_norm": 0.2692917585372925, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0031, + "step": 23530 + }, + { + "epoch": 1.4407246465511965, + "grad_norm": 0.2234862893819809, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0044, + "step": 23540 + }, + { + "epoch": 1.4413366791113287, + "grad_norm": 0.17526887357234955, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0025, + "step": 23550 + }, + { + "epoch": 1.4419487116714609, + "grad_norm": 0.10404029488563538, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0041, + "step": 23560 + }, + { + "epoch": 1.442560744231593, + "grad_norm": 0.1385052353143692, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0027, + "step": 23570 + }, + { + "epoch": 1.4431727767917253, + "grad_norm": 0.30865412950515747, + "learning_rate": 4.090929556079854e-06, + "loss": 0.004, + "step": 23580 + }, + { + "epoch": 1.4437848093518575, + "grad_norm": 0.10908320546150208, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0041, + "step": 23590 + }, + { + "epoch": 1.4443968419119897, + "grad_norm": 0.09885916113853455, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0045, + "step": 23600 + }, + { + "epoch": 1.445008874472122, + "grad_norm": 0.1685211956501007, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0031, + "step": 23610 + }, + { + "epoch": 1.445620907032254, + "grad_norm": 0.0967954769730568, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0031, + "step": 23620 + }, + { + "epoch": 1.4462329395923863, + "grad_norm": 0.07489120960235596, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0038, + "step": 23630 + }, + { + "epoch": 1.4468449721525185, + "grad_norm": 0.20616063475608826, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0043, + "step": 23640 + }, + { + "epoch": 1.4474570047126507, + "grad_norm": 0.15788249671459198, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0031, + "step": 23650 + }, + { + "epoch": 1.448069037272783, + "grad_norm": 0.10360633581876755, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0027, + "step": 23660 + }, + { + "epoch": 1.4486810698329151, + "grad_norm": 0.2871163785457611, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0028, + "step": 23670 + }, + { + "epoch": 1.4492931023930473, + "grad_norm": 0.15280364453792572, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0042, + "step": 23680 + }, + { + "epoch": 1.4499051349531795, + "grad_norm": 0.17502477765083313, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0031, + "step": 23690 + }, + { + "epoch": 1.4505171675133117, + "grad_norm": 0.2154005616903305, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0036, + "step": 23700 + }, + { + "epoch": 1.451129200073444, + "grad_norm": 0.15002919733524323, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0033, + "step": 23710 + }, + { + "epoch": 1.4517412326335761, + "grad_norm": 0.10422170162200928, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0032, + "step": 23720 + }, + { + "epoch": 1.4523532651937083, + "grad_norm": 0.15197636187076569, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0043, + "step": 23730 + }, + { + "epoch": 1.4529652977538405, + "grad_norm": 0.2571481466293335, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.4535773303139727, + "grad_norm": 0.12697578966617584, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0034, + "step": 23750 + }, + { + "epoch": 1.454189362874105, + "grad_norm": 0.14347535371780396, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0051, + "step": 23760 + }, + { + "epoch": 1.4548013954342371, + "grad_norm": 0.1494351178407669, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0037, + "step": 23770 + }, + { + "epoch": 1.4554134279943693, + "grad_norm": 0.23901797831058502, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0032, + "step": 23780 + }, + { + "epoch": 1.4560254605545015, + "grad_norm": 0.1434790939092636, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0036, + "step": 23790 + }, + { + "epoch": 1.4566374931146338, + "grad_norm": 0.1456829458475113, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0034, + "step": 23800 + }, + { + "epoch": 1.457249525674766, + "grad_norm": 0.33969590067863464, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0038, + "step": 23810 + }, + { + "epoch": 1.4578615582348982, + "grad_norm": 0.1768753081560135, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0027, + "step": 23820 + }, + { + "epoch": 1.4584735907950304, + "grad_norm": 0.15212708711624146, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0032, + "step": 23830 + }, + { + "epoch": 1.4590856233551626, + "grad_norm": 0.10870973765850067, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0033, + "step": 23840 + }, + { + "epoch": 1.4596976559152948, + "grad_norm": 0.17898528277873993, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0022, + "step": 23850 + }, + { + "epoch": 1.460309688475427, + "grad_norm": 0.15515227615833282, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0034, + "step": 23860 + }, + { + "epoch": 1.4609217210355592, + "grad_norm": 0.11047070473432541, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0032, + "step": 23870 + }, + { + "epoch": 1.4615337535956914, + "grad_norm": 0.08628113567829132, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0032, + "step": 23880 + }, + { + "epoch": 1.4621457861558236, + "grad_norm": 0.358903706073761, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0043, + "step": 23890 + }, + { + "epoch": 1.4627578187159558, + "grad_norm": 0.13986052572727203, + "learning_rate": 3.895183209452123e-06, + "loss": 0.003, + "step": 23900 + }, + { + "epoch": 1.463369851276088, + "grad_norm": 0.09236793220043182, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0029, + "step": 23910 + }, + { + "epoch": 1.4639818838362202, + "grad_norm": 0.14616963267326355, + "learning_rate": 3.883230136754435e-06, + "loss": 0.005, + "step": 23920 + }, + { + "epoch": 1.4645939163963524, + "grad_norm": 0.0754290223121643, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0031, + "step": 23930 + }, + { + "epoch": 1.4652059489564846, + "grad_norm": 0.16520163416862488, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0037, + "step": 23940 + }, + { + "epoch": 1.4658179815166168, + "grad_norm": 0.06801608204841614, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0023, + "step": 23950 + }, + { + "epoch": 1.466430014076749, + "grad_norm": 0.3087909519672394, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0042, + "step": 23960 + }, + { + "epoch": 1.4670420466368812, + "grad_norm": 0.23470532894134521, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0046, + "step": 23970 + }, + { + "epoch": 1.4676540791970134, + "grad_norm": 0.10248749703168869, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0025, + "step": 23980 + }, + { + "epoch": 1.4682661117571456, + "grad_norm": 0.12478570640087128, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0035, + "step": 23990 + }, + { + "epoch": 1.4688781443172778, + "grad_norm": 0.16669252514839172, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0034, + "step": 24000 + }, + { + "epoch": 1.46949017687741, + "grad_norm": 0.12477939575910568, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.470102209437542, + "grad_norm": 0.1738445907831192, + "learning_rate": 3.823967005382315e-06, + "loss": 0.003, + "step": 24020 + }, + { + "epoch": 1.4707142419976742, + "grad_norm": 0.11228524148464203, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0029, + "step": 24030 + }, + { + "epoch": 1.4713262745578064, + "grad_norm": 0.28472721576690674, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0035, + "step": 24040 + }, + { + "epoch": 1.4719383071179386, + "grad_norm": 0.18087328970432281, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0037, + "step": 24050 + }, + { + "epoch": 1.4725503396780708, + "grad_norm": 0.39030423760414124, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0043, + "step": 24060 + }, + { + "epoch": 1.473162372238203, + "grad_norm": 0.164345845580101, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0029, + "step": 24070 + }, + { + "epoch": 1.4737744047983352, + "grad_norm": 0.14081600308418274, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0038, + "step": 24080 + }, + { + "epoch": 1.4743864373584674, + "grad_norm": 0.27649205923080444, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0037, + "step": 24090 + }, + { + "epoch": 1.4749984699185996, + "grad_norm": 0.08673480153083801, + "learning_rate": 3.777162510056721e-06, + "loss": 0.004, + "step": 24100 + }, + { + "epoch": 1.4756105024787318, + "grad_norm": 0.11770286411046982, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0033, + "step": 24110 + }, + { + "epoch": 1.476222535038864, + "grad_norm": 0.11967290937900543, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0035, + "step": 24120 + }, + { + "epoch": 1.4768345675989962, + "grad_norm": 0.12635833024978638, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0043, + "step": 24130 + }, + { + "epoch": 1.4774466001591284, + "grad_norm": 0.13505803048610687, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0034, + "step": 24140 + }, + { + "epoch": 1.4780586327192606, + "grad_norm": 0.17781652510166168, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0031, + "step": 24150 + }, + { + "epoch": 1.4786706652793928, + "grad_norm": 0.18974725902080536, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0051, + "step": 24160 + }, + { + "epoch": 1.479282697839525, + "grad_norm": 0.12072815746068954, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0027, + "step": 24170 + }, + { + "epoch": 1.4798947303996572, + "grad_norm": 0.10813914984464645, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0023, + "step": 24180 + }, + { + "epoch": 1.4805067629597894, + "grad_norm": 0.07975378632545471, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0035, + "step": 24190 + }, + { + "epoch": 1.4811187955199216, + "grad_norm": 0.0948014184832573, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0033, + "step": 24200 + }, + { + "epoch": 1.4817308280800539, + "grad_norm": 0.11943913251161575, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0038, + "step": 24210 + }, + { + "epoch": 1.482342860640186, + "grad_norm": 0.34374934434890747, + "learning_rate": 3.707974016467e-06, + "loss": 0.0043, + "step": 24220 + }, + { + "epoch": 1.4829548932003183, + "grad_norm": 0.264528751373291, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0037, + "step": 24230 + }, + { + "epoch": 1.4835669257604505, + "grad_norm": 0.08419078588485718, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0031, + "step": 24240 + }, + { + "epoch": 1.4841789583205827, + "grad_norm": 0.3805602192878723, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0039, + "step": 24250 + }, + { + "epoch": 1.4847909908807149, + "grad_norm": 0.09091196954250336, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0034, + "step": 24260 + }, + { + "epoch": 1.485403023440847, + "grad_norm": 0.1352047175168991, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0033, + "step": 24270 + }, + { + "epoch": 1.4860150560009793, + "grad_norm": 0.14287787675857544, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0033, + "step": 24280 + }, + { + "epoch": 1.4866270885611115, + "grad_norm": 0.15490861237049103, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0037, + "step": 24290 + }, + { + "epoch": 1.4872391211212437, + "grad_norm": 0.08607941120862961, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0042, + "step": 24300 + }, + { + "epoch": 1.4878511536813759, + "grad_norm": 0.2872561514377594, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0039, + "step": 24310 + }, + { + "epoch": 1.488463186241508, + "grad_norm": 0.09383561462163925, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4890752188016403, + "grad_norm": 0.13576671481132507, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0039, + "step": 24330 + }, + { + "epoch": 1.4896872513617725, + "grad_norm": 0.21924526989459991, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0038, + "step": 24340 + }, + { + "epoch": 1.4902992839219047, + "grad_norm": 0.24333837628364563, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.490911316482037, + "grad_norm": 0.08171682059764862, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0034, + "step": 24360 + }, + { + "epoch": 1.491523349042169, + "grad_norm": 0.11815544962882996, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0027, + "step": 24370 + }, + { + "epoch": 1.4921353816023013, + "grad_norm": 0.15248773992061615, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0035, + "step": 24380 + }, + { + "epoch": 1.4927474141624335, + "grad_norm": 0.13664020597934723, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0032, + "step": 24390 + }, + { + "epoch": 1.4933594467225657, + "grad_norm": 0.2877022624015808, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0034, + "step": 24400 + }, + { + "epoch": 1.493971479282698, + "grad_norm": 0.1447642594575882, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0033, + "step": 24410 + }, + { + "epoch": 1.4945835118428301, + "grad_norm": 0.18032193183898926, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0036, + "step": 24420 + }, + { + "epoch": 1.4951955444029623, + "grad_norm": 0.1249038353562355, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0023, + "step": 24430 + }, + { + "epoch": 1.4958075769630943, + "grad_norm": 0.21674089133739471, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0036, + "step": 24440 + }, + { + "epoch": 1.4964196095232265, + "grad_norm": 0.2503979504108429, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0039, + "step": 24450 + }, + { + "epoch": 1.4970316420833587, + "grad_norm": 0.15412171185016632, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0035, + "step": 24460 + }, + { + "epoch": 1.497643674643491, + "grad_norm": 0.17718803882598877, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0055, + "step": 24470 + }, + { + "epoch": 1.498255707203623, + "grad_norm": 0.24290283024311066, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0033, + "step": 24480 + }, + { + "epoch": 1.4988677397637553, + "grad_norm": 0.20131447911262512, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0035, + "step": 24490 + }, + { + "epoch": 1.4994797723238875, + "grad_norm": 0.18041104078292847, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0037, + "step": 24500 + }, + { + "epoch": 1.5000918048840197, + "grad_norm": 0.11311472952365875, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0027, + "step": 24510 + }, + { + "epoch": 1.500703837444152, + "grad_norm": 0.10401099175214767, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0035, + "step": 24520 + }, + { + "epoch": 1.5013158700042841, + "grad_norm": 0.16640698909759521, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0029, + "step": 24530 + }, + { + "epoch": 1.5019279025644163, + "grad_norm": 0.1116192489862442, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0027, + "step": 24540 + }, + { + "epoch": 1.5025399351245485, + "grad_norm": 0.14617346227169037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0037, + "step": 24550 + }, + { + "epoch": 1.5031519676846807, + "grad_norm": 0.10546499490737915, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0025, + "step": 24560 + }, + { + "epoch": 1.503764000244813, + "grad_norm": 0.11696954816579819, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0039, + "step": 24570 + }, + { + "epoch": 1.5043760328049451, + "grad_norm": 0.1503429412841797, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0038, + "step": 24580 + }, + { + "epoch": 1.5049880653650773, + "grad_norm": 0.13094773888587952, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0037, + "step": 24590 + }, + { + "epoch": 1.5056000979252095, + "grad_norm": 0.1519947648048401, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0027, + "step": 24600 + }, + { + "epoch": 1.5062121304853417, + "grad_norm": 0.3586391806602478, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0041, + "step": 24610 + }, + { + "epoch": 1.506824163045474, + "grad_norm": 0.14964115619659424, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0029, + "step": 24620 + }, + { + "epoch": 1.5074361956056062, + "grad_norm": 0.2676304578781128, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0033, + "step": 24630 + }, + { + "epoch": 1.5080482281657384, + "grad_norm": 0.117411769926548, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0041, + "step": 24640 + }, + { + "epoch": 1.5086602607258706, + "grad_norm": 0.11224953830242157, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0035, + "step": 24650 + }, + { + "epoch": 1.5092722932860028, + "grad_norm": 0.14367471635341644, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0033, + "step": 24660 + }, + { + "epoch": 1.509884325846135, + "grad_norm": 0.27663105726242065, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.004, + "step": 24670 + }, + { + "epoch": 1.5104963584062672, + "grad_norm": 0.08599471300840378, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0031, + "step": 24680 + }, + { + "epoch": 1.5111083909663994, + "grad_norm": 0.11320041120052338, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0033, + "step": 24690 + }, + { + "epoch": 1.5117204235265316, + "grad_norm": 0.0896427258849144, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0032, + "step": 24700 + }, + { + "epoch": 1.5123324560866638, + "grad_norm": 0.1055784597992897, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0028, + "step": 24710 + }, + { + "epoch": 1.512944488646796, + "grad_norm": 0.0936208963394165, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0036, + "step": 24720 + }, + { + "epoch": 1.5135565212069282, + "grad_norm": 0.13069137930870056, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0033, + "step": 24730 + }, + { + "epoch": 1.5141685537670604, + "grad_norm": 0.17260710895061493, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0034, + "step": 24740 + }, + { + "epoch": 1.5147805863271926, + "grad_norm": 0.26109611988067627, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0044, + "step": 24750 + }, + { + "epoch": 1.5153926188873248, + "grad_norm": 0.22439827024936676, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0042, + "step": 24760 + }, + { + "epoch": 1.516004651447457, + "grad_norm": 0.2269357591867447, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0039, + "step": 24770 + }, + { + "epoch": 1.5166166840075892, + "grad_norm": 0.20416954159736633, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0039, + "step": 24780 + }, + { + "epoch": 1.5172287165677214, + "grad_norm": 0.1766926646232605, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0031, + "step": 24790 + }, + { + "epoch": 1.5178407491278536, + "grad_norm": 0.05759773403406143, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0029, + "step": 24800 + }, + { + "epoch": 1.5184527816879858, + "grad_norm": 0.19152496755123138, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0034, + "step": 24810 + }, + { + "epoch": 1.519064814248118, + "grad_norm": 0.09876703470945358, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.5196768468082502, + "grad_norm": 0.11626110225915909, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0037, + "step": 24830 + }, + { + "epoch": 1.5202888793683824, + "grad_norm": 0.13713783025741577, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0027, + "step": 24840 + }, + { + "epoch": 1.5209009119285146, + "grad_norm": 0.19144660234451294, + "learning_rate": 3.36521439484193e-06, + "loss": 0.004, + "step": 24850 + }, + { + "epoch": 1.5215129444886468, + "grad_norm": 0.1376778483390808, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0037, + "step": 24860 + }, + { + "epoch": 1.522124977048779, + "grad_norm": 0.4120432436466217, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0042, + "step": 24870 + }, + { + "epoch": 1.5227370096089112, + "grad_norm": 0.14245551824569702, + "learning_rate": 3.349767211300933e-06, + "loss": 0.003, + "step": 24880 + }, + { + "epoch": 1.5233490421690434, + "grad_norm": 0.19136923551559448, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0053, + "step": 24890 + }, + { + "epoch": 1.5239610747291756, + "grad_norm": 0.28412777185440063, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0031, + "step": 24900 + }, + { + "epoch": 1.5245731072893078, + "grad_norm": 0.18925072252750397, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.003, + "step": 24910 + }, + { + "epoch": 1.52518513984944, + "grad_norm": 0.21378494799137115, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0034, + "step": 24920 + }, + { + "epoch": 1.5257971724095722, + "grad_norm": 0.19160443544387817, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0037, + "step": 24930 + }, + { + "epoch": 1.5264092049697044, + "grad_norm": 0.19070027768611908, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.5270212375298367, + "grad_norm": 0.20489074289798737, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.003, + "step": 24950 + }, + { + "epoch": 1.5276332700899689, + "grad_norm": 0.15747228264808655, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0037, + "step": 24960 + }, + { + "epoch": 1.528245302650101, + "grad_norm": 0.21312901377677917, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0035, + "step": 24970 + }, + { + "epoch": 1.5288573352102333, + "grad_norm": 0.10329846292734146, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0033, + "step": 24980 + }, + { + "epoch": 1.5294693677703655, + "grad_norm": 0.13872355222702026, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0056, + "step": 24990 + }, + { + "epoch": 1.5300814003304977, + "grad_norm": 0.08532251417636871, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0026, + "step": 25000 + }, + { + "epoch": 1.5306934328906299, + "grad_norm": 0.1309783011674881, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0038, + "step": 25010 + }, + { + "epoch": 1.531305465450762, + "grad_norm": 0.16484731435775757, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0034, + "step": 25020 + }, + { + "epoch": 1.5319174980108943, + "grad_norm": 0.1756003201007843, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0044, + "step": 25030 + }, + { + "epoch": 1.5325295305710265, + "grad_norm": 0.13745243847370148, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0063, + "step": 25040 + }, + { + "epoch": 1.5331415631311587, + "grad_norm": 0.1077183336019516, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0032, + "step": 25050 + }, + { + "epoch": 1.5337535956912909, + "grad_norm": 0.3091605007648468, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0036, + "step": 25060 + }, + { + "epoch": 1.534365628251423, + "grad_norm": 0.13469856977462769, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0031, + "step": 25070 + }, + { + "epoch": 1.5349776608115553, + "grad_norm": 0.2445354014635086, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0037, + "step": 25080 + }, + { + "epoch": 1.5355896933716875, + "grad_norm": 0.1065889522433281, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0038, + "step": 25090 + }, + { + "epoch": 1.5362017259318197, + "grad_norm": 0.1539459079504013, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0036, + "step": 25100 + }, + { + "epoch": 1.536813758491952, + "grad_norm": 0.23242861032485962, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0037, + "step": 25110 + }, + { + "epoch": 1.537425791052084, + "grad_norm": 0.18660615384578705, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0034, + "step": 25120 + }, + { + "epoch": 1.5380378236122163, + "grad_norm": 0.14089861512184143, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0057, + "step": 25130 + }, + { + "epoch": 1.5386498561723485, + "grad_norm": 0.30568358302116394, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0036, + "step": 25140 + }, + { + "epoch": 1.5392618887324807, + "grad_norm": 0.0965384691953659, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0029, + "step": 25150 + }, + { + "epoch": 1.539873921292613, + "grad_norm": 0.12925416231155396, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0028, + "step": 25160 + }, + { + "epoch": 1.5404859538527451, + "grad_norm": 0.10820749402046204, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0025, + "step": 25170 + }, + { + "epoch": 1.5410979864128773, + "grad_norm": 0.200232595205307, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0038, + "step": 25180 + }, + { + "epoch": 1.5417100189730095, + "grad_norm": 0.13515910506248474, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0037, + "step": 25190 + }, + { + "epoch": 1.5423220515331417, + "grad_norm": 0.08493158221244812, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0026, + "step": 25200 + }, + { + "epoch": 1.542934084093274, + "grad_norm": 0.21674226224422455, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0047, + "step": 25210 + }, + { + "epoch": 1.543546116653406, + "grad_norm": 0.18259066343307495, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0029, + "step": 25220 + }, + { + "epoch": 1.5441581492135381, + "grad_norm": 0.14857260882854462, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0029, + "step": 25230 + }, + { + "epoch": 1.5447701817736703, + "grad_norm": 0.1540914922952652, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0026, + "step": 25240 + }, + { + "epoch": 1.5453822143338025, + "grad_norm": 0.08827090263366699, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0029, + "step": 25250 + }, + { + "epoch": 1.5459942468939347, + "grad_norm": 0.07511961460113525, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0035, + "step": 25260 + }, + { + "epoch": 1.546606279454067, + "grad_norm": 0.26209381222724915, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0033, + "step": 25270 + }, + { + "epoch": 1.5472183120141991, + "grad_norm": 0.08861620724201202, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0033, + "step": 25280 + }, + { + "epoch": 1.5478303445743313, + "grad_norm": 0.1642802655696869, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0032, + "step": 25290 + }, + { + "epoch": 1.5484423771344635, + "grad_norm": 0.24771225452423096, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0034, + "step": 25300 + }, + { + "epoch": 1.5490544096945957, + "grad_norm": 0.2717854976654053, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.549666442254728, + "grad_norm": 0.12177802622318268, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0029, + "step": 25320 + }, + { + "epoch": 1.5502784748148601, + "grad_norm": 0.09988416731357574, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0052, + "step": 25330 + }, + { + "epoch": 1.5508905073749923, + "grad_norm": 0.08877446502447128, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0024, + "step": 25340 + }, + { + "epoch": 1.5515025399351245, + "grad_norm": 0.16233091056346893, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.003, + "step": 25350 + }, + { + "epoch": 1.5521145724952568, + "grad_norm": 0.10167178511619568, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0034, + "step": 25360 + }, + { + "epoch": 1.552726605055389, + "grad_norm": 0.14738866686820984, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0036, + "step": 25370 + }, + { + "epoch": 1.5533386376155212, + "grad_norm": 0.07526370882987976, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0036, + "step": 25380 + }, + { + "epoch": 1.5539506701756534, + "grad_norm": 0.1659732311964035, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0032, + "step": 25390 + }, + { + "epoch": 1.5545627027357856, + "grad_norm": 0.18707287311553955, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0023, + "step": 25400 + }, + { + "epoch": 1.5551747352959178, + "grad_norm": 0.21416662633419037, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0029, + "step": 25410 + }, + { + "epoch": 1.55578676785605, + "grad_norm": 0.3034561574459076, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5563988004161822, + "grad_norm": 0.18879717588424683, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5570108329763144, + "grad_norm": 0.12917254865169525, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0038, + "step": 25440 + }, + { + "epoch": 1.5576228655364466, + "grad_norm": 0.0970548763871193, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0046, + "step": 25450 + }, + { + "epoch": 1.5582348980965788, + "grad_norm": 0.17424598336219788, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.558846930656711, + "grad_norm": 0.11429346352815628, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0039, + "step": 25470 + }, + { + "epoch": 1.5594589632168432, + "grad_norm": 0.19154596328735352, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0028, + "step": 25480 + }, + { + "epoch": 1.5600709957769754, + "grad_norm": 0.1475156843662262, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0028, + "step": 25490 + }, + { + "epoch": 1.5606830283371074, + "grad_norm": 0.29066604375839233, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0037, + "step": 25500 + }, + { + "epoch": 1.5612950608972396, + "grad_norm": 0.21379634737968445, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.004, + "step": 25510 + }, + { + "epoch": 1.5619070934573718, + "grad_norm": 0.1648091822862625, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.003, + "step": 25520 + }, + { + "epoch": 1.562519126017504, + "grad_norm": 0.2791198790073395, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0038, + "step": 25530 + }, + { + "epoch": 1.5631311585776362, + "grad_norm": 0.13038018345832825, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0028, + "step": 25540 + }, + { + "epoch": 1.5637431911377684, + "grad_norm": 0.07513634115457535, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0032, + "step": 25550 + }, + { + "epoch": 1.5643552236979006, + "grad_norm": 0.34259703755378723, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5649672562580328, + "grad_norm": 0.1602829545736313, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0036, + "step": 25570 + }, + { + "epoch": 1.565579288818165, + "grad_norm": 0.11303776502609253, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.003, + "step": 25580 + }, + { + "epoch": 1.5661913213782972, + "grad_norm": 0.06348636001348495, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0038, + "step": 25590 + }, + { + "epoch": 1.5668033539384294, + "grad_norm": 0.2563594579696655, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0027, + "step": 25600 + }, + { + "epoch": 1.5674153864985616, + "grad_norm": 0.08260748535394669, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0029, + "step": 25610 + }, + { + "epoch": 1.5680274190586938, + "grad_norm": 0.15986980497837067, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0044, + "step": 25620 + }, + { + "epoch": 1.568639451618826, + "grad_norm": 0.19412761926651, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.004, + "step": 25630 + }, + { + "epoch": 1.5692514841789582, + "grad_norm": 0.16794568300247192, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0042, + "step": 25640 + }, + { + "epoch": 1.5698635167390904, + "grad_norm": 0.34898805618286133, + "learning_rate": 2.981383959667165e-06, + "loss": 0.003, + "step": 25650 + }, + { + "epoch": 1.5704755492992226, + "grad_norm": 0.11825685203075409, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0037, + "step": 25660 + }, + { + "epoch": 1.5710875818593548, + "grad_norm": 0.1430155634880066, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0049, + "step": 25670 + }, + { + "epoch": 1.571699614419487, + "grad_norm": 0.13148540258407593, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0031, + "step": 25680 + }, + { + "epoch": 1.5723116469796192, + "grad_norm": 0.14384756982326508, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0036, + "step": 25690 + }, + { + "epoch": 1.5729236795397514, + "grad_norm": 0.11322541534900665, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0028, + "step": 25700 + }, + { + "epoch": 1.5735357120998836, + "grad_norm": 0.1428067833185196, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0033, + "step": 25710 + }, + { + "epoch": 1.5741477446600158, + "grad_norm": 0.1169947013258934, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0024, + "step": 25720 + }, + { + "epoch": 1.574759777220148, + "grad_norm": 0.33150142431259155, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0038, + "step": 25730 + }, + { + "epoch": 1.5753718097802802, + "grad_norm": 0.12486983090639114, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.003, + "step": 25740 + }, + { + "epoch": 1.5759838423404124, + "grad_norm": 0.12485318630933762, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0025, + "step": 25750 + }, + { + "epoch": 1.5765958749005446, + "grad_norm": 0.10158280283212662, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0032, + "step": 25760 + }, + { + "epoch": 1.5772079074606769, + "grad_norm": 0.13820113241672516, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0041, + "step": 25770 + }, + { + "epoch": 1.577819940020809, + "grad_norm": 0.18718287348747253, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0027, + "step": 25780 + }, + { + "epoch": 1.5784319725809413, + "grad_norm": 0.154324010014534, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.004, + "step": 25790 + }, + { + "epoch": 1.5790440051410735, + "grad_norm": 0.10862802714109421, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0039, + "step": 25800 + }, + { + "epoch": 1.5796560377012057, + "grad_norm": 0.11738114804029465, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0032, + "step": 25810 + }, + { + "epoch": 1.5802680702613379, + "grad_norm": 0.08674368262290955, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0035, + "step": 25820 + }, + { + "epoch": 1.58088010282147, + "grad_norm": 0.16917847096920013, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0032, + "step": 25830 + }, + { + "epoch": 1.5814921353816023, + "grad_norm": 0.10122957825660706, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0044, + "step": 25840 + }, + { + "epoch": 1.5821041679417345, + "grad_norm": 0.14450572431087494, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0031, + "step": 25850 + }, + { + "epoch": 1.5827162005018667, + "grad_norm": 0.11220426112413406, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0035, + "step": 25860 + }, + { + "epoch": 1.5833282330619989, + "grad_norm": 0.15793107450008392, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0034, + "step": 25870 + }, + { + "epoch": 1.583940265622131, + "grad_norm": 0.11485118418931961, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0049, + "step": 25880 + }, + { + "epoch": 1.5845522981822633, + "grad_norm": 0.11588255316019058, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0032, + "step": 25890 + }, + { + "epoch": 1.5851643307423955, + "grad_norm": 0.09770877659320831, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0033, + "step": 25900 + }, + { + "epoch": 1.5857763633025277, + "grad_norm": 0.4078996479511261, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0041, + "step": 25910 + }, + { + "epoch": 1.58638839586266, + "grad_norm": 0.16744333505630493, + "learning_rate": 2.865295218604555e-06, + "loss": 0.003, + "step": 25920 + }, + { + "epoch": 1.587000428422792, + "grad_norm": 0.10358662158250809, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0034, + "step": 25930 + }, + { + "epoch": 1.5876124609829243, + "grad_norm": 0.1420212686061859, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0024, + "step": 25940 + }, + { + "epoch": 1.5882244935430565, + "grad_norm": 0.1387208104133606, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0035, + "step": 25950 + }, + { + "epoch": 1.5888365261031887, + "grad_norm": 0.2383398711681366, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0044, + "step": 25960 + }, + { + "epoch": 1.589448558663321, + "grad_norm": 0.1263049691915512, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0029, + "step": 25970 + }, + { + "epoch": 1.5900605912234531, + "grad_norm": 0.10938797891139984, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0029, + "step": 25980 + }, + { + "epoch": 1.5906726237835853, + "grad_norm": 0.18173988163471222, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0033, + "step": 25990 + }, + { + "epoch": 1.5912846563437175, + "grad_norm": 0.20956522226333618, + "learning_rate": 2.832230653119002e-06, + "loss": 0.003, + "step": 26000 + }, + { + "epoch": 1.5918966889038497, + "grad_norm": 0.5168828368186951, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0038, + "step": 26010 + }, + { + "epoch": 1.592508721463982, + "grad_norm": 0.19130735099315643, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.003, + "step": 26020 + }, + { + "epoch": 1.5931207540241141, + "grad_norm": 0.2398800253868103, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0031, + "step": 26030 + }, + { + "epoch": 1.5937327865842463, + "grad_norm": 0.13288211822509766, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0039, + "step": 26040 + }, + { + "epoch": 1.5943448191443785, + "grad_norm": 0.12008156627416611, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.005, + "step": 26050 + }, + { + "epoch": 1.5949568517045107, + "grad_norm": 0.06939925253391266, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0026, + "step": 26060 + }, + { + "epoch": 1.595568884264643, + "grad_norm": 0.11179028451442719, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0032, + "step": 26070 + }, + { + "epoch": 1.5961809168247751, + "grad_norm": 0.07841819524765015, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0035, + "step": 26080 + }, + { + "epoch": 1.5967929493849073, + "grad_norm": 0.3470489978790283, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.0067, + "step": 26090 + }, + { + "epoch": 1.5974049819450395, + "grad_norm": 0.13002917170524597, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0035, + "step": 26100 + }, + { + "epoch": 1.5980170145051718, + "grad_norm": 0.10265816748142242, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0026, + "step": 26110 + }, + { + "epoch": 1.598629047065304, + "grad_norm": 0.0917414203286171, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.003, + "step": 26120 + }, + { + "epoch": 1.5992410796254362, + "grad_norm": 0.11112091690301895, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0039, + "step": 26130 + }, + { + "epoch": 1.5998531121855684, + "grad_norm": 0.08949574083089828, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0035, + "step": 26140 + }, + { + "epoch": 1.6004651447457006, + "grad_norm": 0.10606437176465988, + "learning_rate": 2.771889969647e-06, + "loss": 0.0046, + "step": 26150 + }, + { + "epoch": 1.6010771773058328, + "grad_norm": 0.1891089379787445, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0025, + "step": 26160 + }, + { + "epoch": 1.601689209865965, + "grad_norm": 0.11007837951183319, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0032, + "step": 26170 + }, + { + "epoch": 1.6023012424260972, + "grad_norm": 0.2129961997270584, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0032, + "step": 26180 + }, + { + "epoch": 1.6029132749862294, + "grad_norm": 0.2265758216381073, + "learning_rate": 2.756165401834933e-06, + "loss": 0.003, + "step": 26190 + }, + { + "epoch": 1.6035253075463616, + "grad_norm": 0.29450783133506775, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.0039, + "step": 26200 + }, + { + "epoch": 1.6041373401064938, + "grad_norm": 0.48828232288360596, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0054, + "step": 26210 + }, + { + "epoch": 1.604749372666626, + "grad_norm": 0.2561551630496979, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0038, + "step": 26220 + }, + { + "epoch": 1.6053614052267582, + "grad_norm": 0.1838567554950714, + "learning_rate": 2.740595627381096e-06, + "loss": 0.004, + "step": 26230 + }, + { + "epoch": 1.6059734377868904, + "grad_norm": 0.1419040560722351, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0028, + "step": 26240 + }, + { + "epoch": 1.6065854703470226, + "grad_norm": 0.11946547776460648, + "learning_rate": 2.732868879137055e-06, + "loss": 0.004, + "step": 26250 + }, + { + "epoch": 1.6071975029071548, + "grad_norm": 0.2451052963733673, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0041, + "step": 26260 + }, + { + "epoch": 1.607809535467287, + "grad_norm": 0.11013349890708923, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0031, + "step": 26270 + }, + { + "epoch": 1.6084215680274192, + "grad_norm": 0.13513876497745514, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0067, + "step": 26280 + }, + { + "epoch": 1.6090336005875514, + "grad_norm": 0.13167037069797516, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0054, + "step": 26290 + }, + { + "epoch": 1.6096456331476836, + "grad_norm": 0.17578460276126862, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0042, + "step": 26300 + }, + { + "epoch": 1.6102576657078158, + "grad_norm": 0.26278436183929443, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0052, + "step": 26310 + }, + { + "epoch": 1.610869698267948, + "grad_norm": 0.12841887772083282, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.0029, + "step": 26320 + }, + { + "epoch": 1.6114817308280802, + "grad_norm": 0.08532734215259552, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0037, + "step": 26330 + }, + { + "epoch": 1.6120937633882122, + "grad_norm": 0.23955127596855164, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.0026, + "step": 26340 + }, + { + "epoch": 1.6127057959483444, + "grad_norm": 0.11942708492279053, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0032, + "step": 26350 + }, + { + "epoch": 1.6133178285084766, + "grad_norm": 0.2980901002883911, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.0036, + "step": 26360 + }, + { + "epoch": 1.6139298610686088, + "grad_norm": 0.18042345345020294, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0023, + "step": 26370 + }, + { + "epoch": 1.614541893628741, + "grad_norm": 0.09250669926404953, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0028, + "step": 26380 + }, + { + "epoch": 1.6151539261888732, + "grad_norm": 0.11877484619617462, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.0042, + "step": 26390 + }, + { + "epoch": 1.6157659587490054, + "grad_norm": 0.20574252307415009, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0028, + "step": 26400 + }, + { + "epoch": 1.6163779913091376, + "grad_norm": 0.18342842161655426, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0028, + "step": 26410 + }, + { + "epoch": 1.6169900238692698, + "grad_norm": 0.18038654327392578, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0031, + "step": 26420 + }, + { + "epoch": 1.617602056429402, + "grad_norm": 0.14160999655723572, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0035, + "step": 26430 + }, + { + "epoch": 1.6182140889895342, + "grad_norm": 0.09427947551012039, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0041, + "step": 26440 + }, + { + "epoch": 1.6188261215496664, + "grad_norm": 0.07515032589435577, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0032, + "step": 26450 + }, + { + "epoch": 1.6194381541097986, + "grad_norm": 0.19633768498897552, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0043, + "step": 26460 + }, + { + "epoch": 1.6200501866699308, + "grad_norm": 0.22237136960029602, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.003, + "step": 26470 + }, + { + "epoch": 1.620662219230063, + "grad_norm": 0.21898943185806274, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0033, + "step": 26480 + }, + { + "epoch": 1.6212742517901952, + "grad_norm": 0.14833909273147583, + "learning_rate": 2.643185095075473e-06, + "loss": 0.003, + "step": 26490 + }, + { + "epoch": 1.6218862843503274, + "grad_norm": 0.10988935828208923, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0036, + "step": 26500 + }, + { + "epoch": 1.6224983169104596, + "grad_norm": 0.17635370790958405, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0047, + "step": 26510 + }, + { + "epoch": 1.6231103494705919, + "grad_norm": 0.15108852088451385, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0034, + "step": 26520 + }, + { + "epoch": 1.623722382030724, + "grad_norm": 0.1829880177974701, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0041, + "step": 26530 + }, + { + "epoch": 1.6243344145908563, + "grad_norm": 0.15146563947200775, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0034, + "step": 26540 + }, + { + "epoch": 1.6249464471509885, + "grad_norm": 0.1440849006175995, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0044, + "step": 26550 + }, + { + "epoch": 1.6255584797111207, + "grad_norm": 0.1681547313928604, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0045, + "step": 26560 + }, + { + "epoch": 1.6261705122712529, + "grad_norm": 0.07170043885707855, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0028, + "step": 26570 + }, + { + "epoch": 1.626782544831385, + "grad_norm": 0.0961712971329689, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0026, + "step": 26580 + }, + { + "epoch": 1.6273945773915173, + "grad_norm": 0.0957784354686737, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0031, + "step": 26590 + }, + { + "epoch": 1.6280066099516495, + "grad_norm": 0.09888478368520737, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0042, + "step": 26600 + }, + { + "epoch": 1.6286186425117817, + "grad_norm": 0.1469460278749466, + "learning_rate": 2.600457796416397e-06, + "loss": 0.003, + "step": 26610 + }, + { + "epoch": 1.6292306750719139, + "grad_norm": 0.23431086540222168, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.003, + "step": 26620 + }, + { + "epoch": 1.6298427076320459, + "grad_norm": 0.11390798538923264, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0026, + "step": 26630 + }, + { + "epoch": 1.630454740192178, + "grad_norm": 0.17735126614570618, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0032, + "step": 26640 + }, + { + "epoch": 1.6310667727523103, + "grad_norm": 0.047082606703042984, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0036, + "step": 26650 + }, + { + "epoch": 1.6316788053124425, + "grad_norm": 0.3262721598148346, + "learning_rate": 2.583073279935805e-06, + "loss": 0.004, + "step": 26660 + }, + { + "epoch": 1.6322908378725747, + "grad_norm": 0.2153632938861847, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.003, + "step": 26670 + }, + { + "epoch": 1.6329028704327069, + "grad_norm": 0.12398967891931534, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0031, + "step": 26680 + }, + { + "epoch": 1.633514902992839, + "grad_norm": 0.404419481754303, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0038, + "step": 26690 + }, + { + "epoch": 1.6341269355529713, + "grad_norm": 0.3094029426574707, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.005, + "step": 26700 + }, + { + "epoch": 1.6347389681131035, + "grad_norm": 0.23702147603034973, + "learning_rate": 2.565935706183804e-06, + "loss": 0.003, + "step": 26710 + }, + { + "epoch": 1.6353510006732357, + "grad_norm": 0.175592839717865, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0038, + "step": 26720 + }, + { + "epoch": 1.635963033233368, + "grad_norm": 0.20330312848091125, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0034, + "step": 26730 + }, + { + "epoch": 1.6365750657935, + "grad_norm": 0.1990291029214859, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0031, + "step": 26740 + }, + { + "epoch": 1.6371870983536323, + "grad_norm": 0.2611120343208313, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0026, + "step": 26750 + }, + { + "epoch": 1.6377991309137645, + "grad_norm": 0.15563850104808807, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0029, + "step": 26760 + }, + { + "epoch": 1.6384111634738967, + "grad_norm": 0.10159289091825485, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0027, + "step": 26770 + }, + { + "epoch": 1.639023196034029, + "grad_norm": 0.14164364337921143, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0022, + "step": 26780 + }, + { + "epoch": 1.639635228594161, + "grad_norm": 0.09149957448244095, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0042, + "step": 26790 + }, + { + "epoch": 1.6402472611542933, + "grad_norm": 0.19528718292713165, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0021, + "step": 26800 + }, + { + "epoch": 1.6408592937144255, + "grad_norm": 0.11716540157794952, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0029, + "step": 26810 + }, + { + "epoch": 1.6414713262745577, + "grad_norm": 0.06402851641178131, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0023, + "step": 26820 + }, + { + "epoch": 1.64208335883469, + "grad_norm": 0.12224840372800827, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0029, + "step": 26830 + }, + { + "epoch": 1.6426953913948221, + "grad_norm": 0.13217593729496002, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0035, + "step": 26840 + }, + { + "epoch": 1.6433074239549543, + "grad_norm": 0.15030793845653534, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0026, + "step": 26850 + }, + { + "epoch": 1.6439194565150865, + "grad_norm": 0.10057740658521652, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0044, + "step": 26860 + }, + { + "epoch": 1.6445314890752187, + "grad_norm": 0.19387565553188324, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0032, + "step": 26870 + }, + { + "epoch": 1.645143521635351, + "grad_norm": 0.32513365149497986, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0026, + "step": 26880 + }, + { + "epoch": 1.6457555541954831, + "grad_norm": 0.11426142603158951, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0035, + "step": 26890 + }, + { + "epoch": 1.6463675867556153, + "grad_norm": 0.15678571164608002, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0033, + "step": 26900 + }, + { + "epoch": 1.6469796193157475, + "grad_norm": 0.0901828184723854, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0033, + "step": 26910 + }, + { + "epoch": 1.6475916518758797, + "grad_norm": 0.1439771205186844, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0036, + "step": 26920 + }, + { + "epoch": 1.648203684436012, + "grad_norm": 0.08516893535852432, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0031, + "step": 26930 + }, + { + "epoch": 1.6488157169961442, + "grad_norm": 0.13487808406352997, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0038, + "step": 26940 + }, + { + "epoch": 1.6494277495562764, + "grad_norm": 0.12181483954191208, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0048, + "step": 26950 + }, + { + "epoch": 1.6500397821164086, + "grad_norm": 0.11907542496919632, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0028, + "step": 26960 + }, + { + "epoch": 1.6506518146765408, + "grad_norm": 0.11463847011327744, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0032, + "step": 26970 + }, + { + "epoch": 1.651263847236673, + "grad_norm": 0.10308004170656204, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0027, + "step": 26980 + }, + { + "epoch": 1.6518758797968052, + "grad_norm": 0.1553436815738678, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0032, + "step": 26990 + }, + { + "epoch": 1.6524879123569374, + "grad_norm": 0.11983859539031982, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0031, + "step": 27000 + }, + { + "epoch": 1.6530999449170696, + "grad_norm": 0.07867950201034546, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.003, + "step": 27010 + }, + { + "epoch": 1.6537119774772018, + "grad_norm": 0.0990489274263382, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0035, + "step": 27020 + }, + { + "epoch": 1.654324010037334, + "grad_norm": 0.15849289298057556, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0029, + "step": 27030 + }, + { + "epoch": 1.6549360425974662, + "grad_norm": 0.23918525874614716, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0032, + "step": 27040 + }, + { + "epoch": 1.6555480751575984, + "grad_norm": 0.15686926245689392, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.003, + "step": 27050 + }, + { + "epoch": 1.6561601077177306, + "grad_norm": 0.06435749679803848, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.003, + "step": 27060 + }, + { + "epoch": 1.6567721402778628, + "grad_norm": 0.1966746598482132, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0029, + "step": 27070 + }, + { + "epoch": 1.657384172837995, + "grad_norm": 0.1173984557390213, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0034, + "step": 27080 + }, + { + "epoch": 1.6579962053981272, + "grad_norm": 0.15185165405273438, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0036, + "step": 27090 + }, + { + "epoch": 1.6586082379582594, + "grad_norm": 0.1371954381465912, + "learning_rate": 2.440792688039862e-06, + "loss": 0.002, + "step": 27100 + }, + { + "epoch": 1.6592202705183916, + "grad_norm": 0.10718704760074615, + "learning_rate": 2.437783861778914e-06, + "loss": 0.003, + "step": 27110 + }, + { + "epoch": 1.6598323030785238, + "grad_norm": 0.09085255861282349, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0028, + "step": 27120 + }, + { + "epoch": 1.660444335638656, + "grad_norm": 0.12604662775993347, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0029, + "step": 27130 + }, + { + "epoch": 1.6610563681987882, + "grad_norm": 0.06227592006325722, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0034, + "step": 27140 + }, + { + "epoch": 1.6616684007589204, + "grad_norm": 0.15667739510536194, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0029, + "step": 27150 + }, + { + "epoch": 1.6622804333190526, + "grad_norm": 0.11927297711372375, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0025, + "step": 27160 + }, + { + "epoch": 1.6628924658791848, + "grad_norm": 0.13583429157733917, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0049, + "step": 27170 + }, + { + "epoch": 1.663504498439317, + "grad_norm": 0.31264790892601013, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0032, + "step": 27180 + }, + { + "epoch": 1.6641165309994492, + "grad_norm": 0.1507059931755066, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0041, + "step": 27190 + }, + { + "epoch": 1.6647285635595814, + "grad_norm": 0.22571611404418945, + "learning_rate": 2.411157015949963e-06, + "loss": 0.006, + "step": 27200 + }, + { + "epoch": 1.6653405961197136, + "grad_norm": 0.07582036405801773, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0025, + "step": 27210 + }, + { + "epoch": 1.6659526286798458, + "grad_norm": 0.16827397048473358, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0025, + "step": 27220 + }, + { + "epoch": 1.666564661239978, + "grad_norm": 0.26645299792289734, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0034, + "step": 27230 + }, + { + "epoch": 1.6671766938001102, + "grad_norm": 0.15947957336902618, + "learning_rate": 2.399584779188417e-06, + "loss": 0.003, + "step": 27240 + }, + { + "epoch": 1.6677887263602424, + "grad_norm": 0.16127845644950867, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0049, + "step": 27250 + }, + { + "epoch": 1.6684007589203746, + "grad_norm": 0.1279461681842804, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0027, + "step": 27260 + }, + { + "epoch": 1.6690127914805069, + "grad_norm": 0.06649098545312881, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0021, + "step": 27270 + }, + { + "epoch": 1.669624824040639, + "grad_norm": 0.196940615773201, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0024, + "step": 27280 + }, + { + "epoch": 1.6702368566007713, + "grad_norm": 0.07980433851480484, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0028, + "step": 27290 + }, + { + "epoch": 1.6708488891609035, + "grad_norm": 0.10023880004882812, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0027, + "step": 27300 + }, + { + "epoch": 1.6714609217210357, + "grad_norm": 0.12118209153413773, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0035, + "step": 27310 + }, + { + "epoch": 1.6720729542811679, + "grad_norm": 0.1536104530096054, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0032, + "step": 27320 + }, + { + "epoch": 1.6726849868413, + "grad_norm": 0.0671612024307251, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0025, + "step": 27330 + }, + { + "epoch": 1.6732970194014323, + "grad_norm": 0.17756326496601105, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0033, + "step": 27340 + }, + { + "epoch": 1.6739090519615645, + "grad_norm": 0.07412310689687729, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0039, + "step": 27350 + }, + { + "epoch": 1.6745210845216967, + "grad_norm": 0.17036253213882446, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.0046, + "step": 27360 + }, + { + "epoch": 1.6751331170818289, + "grad_norm": 0.07159245759248734, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0025, + "step": 27370 + }, + { + "epoch": 1.675745149641961, + "grad_norm": 0.11311008781194687, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0028, + "step": 27380 + }, + { + "epoch": 1.6763571822020933, + "grad_norm": 0.062365781515836716, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0029, + "step": 27390 + }, + { + "epoch": 1.6769692147622255, + "grad_norm": 0.1132882833480835, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.003, + "step": 27400 + }, + { + "epoch": 1.6775812473223577, + "grad_norm": 0.2946174740791321, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.003, + "step": 27410 + }, + { + "epoch": 1.67819327988249, + "grad_norm": 0.22978715598583221, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0028, + "step": 27420 + }, + { + "epoch": 1.678805312442622, + "grad_norm": 0.12381251156330109, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0035, + "step": 27430 + }, + { + "epoch": 1.6794173450027543, + "grad_norm": 0.3918306231498718, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0036, + "step": 27440 + }, + { + "epoch": 1.6800293775628865, + "grad_norm": 0.1729428470134735, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0039, + "step": 27450 + }, + { + "epoch": 1.6806414101230187, + "grad_norm": 0.10841631144285202, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0032, + "step": 27460 + }, + { + "epoch": 1.6812534426831507, + "grad_norm": 0.12045114487409592, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0048, + "step": 27470 + }, + { + "epoch": 1.681865475243283, + "grad_norm": 0.15946263074874878, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0025, + "step": 27480 + }, + { + "epoch": 1.682477507803415, + "grad_norm": 0.20978282392024994, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0039, + "step": 27490 + }, + { + "epoch": 1.6830895403635473, + "grad_norm": 0.4889276325702667, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.005, + "step": 27500 + }, + { + "epoch": 1.6837015729236795, + "grad_norm": 0.10033760219812393, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0032, + "step": 27510 + }, + { + "epoch": 1.6843136054838117, + "grad_norm": 0.16516660153865814, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0043, + "step": 27520 + }, + { + "epoch": 1.684925638043944, + "grad_norm": 0.15988346934318542, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0031, + "step": 27530 + }, + { + "epoch": 1.685537670604076, + "grad_norm": 0.0838918536901474, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0033, + "step": 27540 + }, + { + "epoch": 1.6861497031642083, + "grad_norm": 0.09774886816740036, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0037, + "step": 27550 + }, + { + "epoch": 1.6867617357243405, + "grad_norm": 0.11428319662809372, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0028, + "step": 27560 + }, + { + "epoch": 1.6873737682844727, + "grad_norm": 0.0789853185415268, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0033, + "step": 27570 + }, + { + "epoch": 1.687985800844605, + "grad_norm": 0.12702232599258423, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0033, + "step": 27580 + }, + { + "epoch": 1.6885978334047371, + "grad_norm": 0.12080296128988266, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0027, + "step": 27590 + }, + { + "epoch": 1.6892098659648693, + "grad_norm": 0.21917396783828735, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0032, + "step": 27600 + }, + { + "epoch": 1.6898218985250015, + "grad_norm": 0.28265318274497986, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0036, + "step": 27610 + }, + { + "epoch": 1.6904339310851337, + "grad_norm": 0.09106706827878952, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0029, + "step": 27620 + }, + { + "epoch": 1.691045963645266, + "grad_norm": 0.1670890897512436, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.0024, + "step": 27630 + }, + { + "epoch": 1.6916579962053981, + "grad_norm": 0.16830581426620483, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0033, + "step": 27640 + }, + { + "epoch": 1.6922700287655303, + "grad_norm": 0.3394775092601776, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0045, + "step": 27650 + }, + { + "epoch": 1.6928820613256625, + "grad_norm": 0.11403192579746246, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0047, + "step": 27660 + }, + { + "epoch": 1.6934940938857947, + "grad_norm": 0.12133318930864334, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0038, + "step": 27670 + }, + { + "epoch": 1.694106126445927, + "grad_norm": 0.2074453979730606, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0045, + "step": 27680 + }, + { + "epoch": 1.6947181590060592, + "grad_norm": 0.0654371827840805, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0023, + "step": 27690 + }, + { + "epoch": 1.6953301915661914, + "grad_norm": 0.3289278745651245, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0043, + "step": 27700 + }, + { + "epoch": 1.6959422241263236, + "grad_norm": 0.10692958533763885, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0026, + "step": 27710 + }, + { + "epoch": 1.6965542566864558, + "grad_norm": 0.06414066255092621, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0022, + "step": 27720 + }, + { + "epoch": 1.697166289246588, + "grad_norm": 0.22467097640037537, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0028, + "step": 27730 + }, + { + "epoch": 1.6977783218067202, + "grad_norm": 0.14074043929576874, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0033, + "step": 27740 + }, + { + "epoch": 1.6983903543668522, + "grad_norm": 0.17113615572452545, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0033, + "step": 27750 + }, + { + "epoch": 1.6990023869269844, + "grad_norm": 0.09429248422384262, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0027, + "step": 27760 + }, + { + "epoch": 1.6996144194871166, + "grad_norm": 0.06843049824237823, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0044, + "step": 27770 + }, + { + "epoch": 1.7002264520472488, + "grad_norm": 0.08251061290502548, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0029, + "step": 27780 + }, + { + "epoch": 1.700838484607381, + "grad_norm": 0.29461193084716797, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0037, + "step": 27790 + }, + { + "epoch": 1.7014505171675132, + "grad_norm": 0.11461394280195236, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0027, + "step": 27800 + }, + { + "epoch": 1.7020625497276454, + "grad_norm": 0.15875136852264404, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.0031, + "step": 27810 + }, + { + "epoch": 1.7026745822877776, + "grad_norm": 0.097860187292099, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0034, + "step": 27820 + }, + { + "epoch": 1.7032866148479098, + "grad_norm": 0.07356908917427063, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0026, + "step": 27830 + }, + { + "epoch": 1.703898647408042, + "grad_norm": 0.1890958547592163, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0037, + "step": 27840 + }, + { + "epoch": 1.7045106799681742, + "grad_norm": 0.1173754408955574, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0034, + "step": 27850 + }, + { + "epoch": 1.7051227125283064, + "grad_norm": 0.2559126019477844, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0024, + "step": 27860 + }, + { + "epoch": 1.7057347450884386, + "grad_norm": 0.17337289452552795, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0026, + "step": 27870 + }, + { + "epoch": 1.7063467776485708, + "grad_norm": 0.34073203802108765, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0057, + "step": 27880 + }, + { + "epoch": 1.706958810208703, + "grad_norm": 0.1395779252052307, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0029, + "step": 27890 + }, + { + "epoch": 1.7075708427688352, + "grad_norm": 0.07645416259765625, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0034, + "step": 27900 + }, + { + "epoch": 1.7081828753289674, + "grad_norm": 0.19695641100406647, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0031, + "step": 27910 + }, + { + "epoch": 1.7087949078890996, + "grad_norm": 0.09641100466251373, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0029, + "step": 27920 + }, + { + "epoch": 1.7094069404492318, + "grad_norm": 0.13393571972846985, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0029, + "step": 27930 + }, + { + "epoch": 1.710018973009364, + "grad_norm": 0.12252296507358551, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0035, + "step": 27940 + }, + { + "epoch": 1.7106310055694962, + "grad_norm": 0.18026909232139587, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0033, + "step": 27950 + }, + { + "epoch": 1.7112430381296284, + "grad_norm": 0.11210714280605316, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0041, + "step": 27960 + }, + { + "epoch": 1.7118550706897606, + "grad_norm": 0.08154530823230743, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0023, + "step": 27970 + }, + { + "epoch": 1.7124671032498928, + "grad_norm": 0.11625959724187851, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.002, + "step": 27980 + }, + { + "epoch": 1.713079135810025, + "grad_norm": 0.17261847853660583, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0034, + "step": 27990 + }, + { + "epoch": 1.7136911683701572, + "grad_norm": 0.2842121422290802, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0037, + "step": 28000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.748988931866624e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/training_args.bin b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd9e28a44ae85140e2ef027a82e8be4c39167cc4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5644791eb57bcb4c4808b4c2429b71e4c49eece4fc60f263f4553a3380f230bb +size 6097 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/generation_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9129331ebe688349ffe3716540872bf018d97ac3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b408b1954964cad3baaf590b0f8a4577e39f5c3a0cd70d56798923e542df9431 +size 4921072616 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e7bc8d4157c4995f8c2ef08078ef001e58b1e6be --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889c8a2bc301c00c7a3595263f5be2ecdd423b3a320092d60f49d545bfc96a1c +size 4978830984 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3abc785fb50560572293522e2ccb633a3ac3f9b2 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0443466509d753d0638fcb9827ce86fbec9b3c05f32cc9a3db9201092996dd6 +size 4100977896 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..f33de4b80f47e0bac1a414431a8354d8345d60c5 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.65332532291412, + -30.64622355117798, + -14.452480476760865, + -1.8581012797355654, + -2.2742317820549007, + -1.9569469915390014, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 3.0011677881240857, + 22.348905650329584, + 21.68580058555603, + 2.3937565994262693, + 4.117288079452516, + 3.295379007720948, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.570000648498535, + -1.0618462562561035, + 3.623035430908203, + 0.010442602448165417, + 0.7240540385246277, + 0.44398337602615356, + 0.12898989021778107, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.04909086227417, + 17.099597930908203, + 8.363018989562988, + 0.6997263431549072, + 1.1358375549316406, + 0.9687971472740173, + 0.9916459321975708, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.777750787353515, + -21.249025872802733, + -2.4021557040214536, + -4.092200187206268, + -3.2986312219619753, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.645499613952634, + 30.59561934127808, + 14.405443457031247, + 1.8499586300849913, + 2.268683268356323, + 1.963451420021057, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.6817545890808105, + 1.3444018363952637, + -3.5411791801452637, + -0.009792014956474304, + -0.7230188846588135, + -0.44849714636802673, + 0.15749873220920563, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.988739013671875, + 16.884004592895508, + 8.242538452148438, + 0.6991510391235352, + 1.1302146911621094, + 0.9690405130386353, + 0.9875192046165466, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..53e54192e5280910ff8e8d69dec7e74c9d98fbee --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8360976803965972, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006120325601321991, + "grad_norm": 2.2432243824005127, + "learning_rate": 1.8e-07, + "loss": 0.1384, + "step": 10 + }, + { + "epoch": 0.0012240651202643981, + "grad_norm": 1.959119439125061, + "learning_rate": 3.8e-07, + "loss": 0.1388, + "step": 20 + }, + { + "epoch": 0.001836097680396597, + "grad_norm": 1.8843899965286255, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1307, + "step": 30 + }, + { + "epoch": 0.0024481302405287963, + "grad_norm": 1.7569042444229126, + "learning_rate": 7.8e-07, + "loss": 0.1238, + "step": 40 + }, + { + "epoch": 0.0030601628006609954, + "grad_norm": 2.6189017295837402, + "learning_rate": 9.800000000000001e-07, + "loss": 0.1275, + "step": 50 + }, + { + "epoch": 0.003672195360793194, + "grad_norm": 1.8418694734573364, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.1032, + "step": 60 + }, + { + "epoch": 0.004284227920925393, + "grad_norm": 1.481676697731018, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0816, + "step": 70 + }, + { + "epoch": 0.004896260481057593, + "grad_norm": 0.9590038061141968, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0732, + "step": 80 + }, + { + "epoch": 0.005508293041189791, + "grad_norm": 1.002897024154663, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0608, + "step": 90 + }, + { + "epoch": 0.006120325601321991, + "grad_norm": 0.9830108284950256, + "learning_rate": 1.98e-06, + "loss": 0.042, + "step": 100 + }, + { + "epoch": 0.006732358161454189, + "grad_norm": 0.858244001865387, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0314, + "step": 110 + }, + { + "epoch": 0.007344390721586388, + "grad_norm": 0.5761063694953918, + "learning_rate": 2.38e-06, + "loss": 0.029, + "step": 120 + }, + { + "epoch": 0.007956423281718587, + "grad_norm": 0.5434514284133911, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0227, + "step": 130 + }, + { + "epoch": 0.008568455841850786, + "grad_norm": 0.6488766670227051, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0202, + "step": 140 + }, + { + "epoch": 0.009180488401982986, + "grad_norm": 0.36763015389442444, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0157, + "step": 150 + }, + { + "epoch": 0.009792520962115185, + "grad_norm": 0.49271446466445923, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.010404553522247383, + "grad_norm": 0.23608209192752838, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0122, + "step": 170 + }, + { + "epoch": 0.011016586082379582, + "grad_norm": 0.47871828079223633, + "learning_rate": 3.58e-06, + "loss": 0.0131, + "step": 180 + }, + { + "epoch": 0.011628618642511782, + "grad_norm": 0.6862446069717407, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0131, + "step": 190 + }, + { + "epoch": 0.012240651202643981, + "grad_norm": 0.7964349389076233, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 0.01285268376277618, + "grad_norm": 0.5564846396446228, + "learning_rate": 4.18e-06, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 0.013464716322908379, + "grad_norm": 0.2810452878475189, + "learning_rate": 4.38e-06, + "loss": 0.0128, + "step": 220 + }, + { + "epoch": 0.014076748883040578, + "grad_norm": 0.4474979341030121, + "learning_rate": 4.58e-06, + "loss": 0.0188, + "step": 230 + }, + { + "epoch": 0.014688781443172776, + "grad_norm": 0.47965875267982483, + "learning_rate": 4.78e-06, + "loss": 0.0141, + "step": 240 + }, + { + "epoch": 0.015300814003304975, + "grad_norm": 0.3410812020301819, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0085, + "step": 250 + }, + { + "epoch": 0.015912846563437173, + "grad_norm": 0.39907002449035645, + "learning_rate": 5.18e-06, + "loss": 0.0106, + "step": 260 + }, + { + "epoch": 0.016524879123569373, + "grad_norm": 0.28909367322921753, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0103, + "step": 270 + }, + { + "epoch": 0.017136911683701572, + "grad_norm": 0.31524109840393066, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0101, + "step": 280 + }, + { + "epoch": 0.017748944243833772, + "grad_norm": 0.29430100321769714, + "learning_rate": 5.78e-06, + "loss": 0.0109, + "step": 290 + }, + { + "epoch": 0.01836097680396597, + "grad_norm": 0.2709169387817383, + "learning_rate": 5.98e-06, + "loss": 0.0102, + "step": 300 + }, + { + "epoch": 0.01897300936409817, + "grad_norm": 0.33067119121551514, + "learning_rate": 6.18e-06, + "loss": 0.0095, + "step": 310 + }, + { + "epoch": 0.01958504192423037, + "grad_norm": 0.28110620379447937, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0102, + "step": 320 + }, + { + "epoch": 0.02019707448436257, + "grad_norm": 0.27736902236938477, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0088, + "step": 330 + }, + { + "epoch": 0.020809107044494766, + "grad_norm": 0.3238557279109955, + "learning_rate": 6.780000000000001e-06, + "loss": 0.01, + "step": 340 + }, + { + "epoch": 0.021421139604626965, + "grad_norm": 0.30263441801071167, + "learning_rate": 6.98e-06, + "loss": 0.0095, + "step": 350 + }, + { + "epoch": 0.022033172164759165, + "grad_norm": 0.2618265450000763, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0096, + "step": 360 + }, + { + "epoch": 0.022645204724891364, + "grad_norm": 0.272565633058548, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0093, + "step": 370 + }, + { + "epoch": 0.023257237285023564, + "grad_norm": 0.44272440671920776, + "learning_rate": 7.58e-06, + "loss": 0.0087, + "step": 380 + }, + { + "epoch": 0.023869269845155763, + "grad_norm": 0.27631404995918274, + "learning_rate": 7.78e-06, + "loss": 0.0093, + "step": 390 + }, + { + "epoch": 0.024481302405287963, + "grad_norm": 0.4108494520187378, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0093, + "step": 400 + }, + { + "epoch": 0.02509333496542016, + "grad_norm": 0.43498387932777405, + "learning_rate": 8.18e-06, + "loss": 0.0098, + "step": 410 + }, + { + "epoch": 0.02570536752555236, + "grad_norm": 0.3419845700263977, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0091, + "step": 420 + }, + { + "epoch": 0.026317400085684558, + "grad_norm": 0.5677013993263245, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0104, + "step": 430 + }, + { + "epoch": 0.026929432645816757, + "grad_norm": 0.24424298107624054, + "learning_rate": 8.78e-06, + "loss": 0.0089, + "step": 440 + }, + { + "epoch": 0.027541465205948957, + "grad_norm": 0.267781138420105, + "learning_rate": 8.98e-06, + "loss": 0.0107, + "step": 450 + }, + { + "epoch": 0.028153497766081156, + "grad_norm": 0.38459253311157227, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0081, + "step": 460 + }, + { + "epoch": 0.028765530326213356, + "grad_norm": 0.2647954523563385, + "learning_rate": 9.38e-06, + "loss": 0.0082, + "step": 470 + }, + { + "epoch": 0.029377562886345552, + "grad_norm": 0.44312018156051636, + "learning_rate": 9.58e-06, + "loss": 0.0102, + "step": 480 + }, + { + "epoch": 0.02998959544647775, + "grad_norm": 0.2309781014919281, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0118, + "step": 490 + }, + { + "epoch": 0.03060162800660995, + "grad_norm": 0.41755014657974243, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0094, + "step": 500 + }, + { + "epoch": 0.03121366056674215, + "grad_norm": 0.38537120819091797, + "learning_rate": 1.018e-05, + "loss": 0.011, + "step": 510 + }, + { + "epoch": 0.031825693126874346, + "grad_norm": 0.49801477789878845, + "learning_rate": 1.038e-05, + "loss": 0.0093, + "step": 520 + }, + { + "epoch": 0.03243772568700655, + "grad_norm": 0.3854966163635254, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0116, + "step": 530 + }, + { + "epoch": 0.033049758247138745, + "grad_norm": 0.3163810968399048, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.008, + "step": 540 + }, + { + "epoch": 0.03366179080727095, + "grad_norm": 0.33000636100769043, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0093, + "step": 550 + }, + { + "epoch": 0.034273823367403145, + "grad_norm": 0.3350297808647156, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0083, + "step": 560 + }, + { + "epoch": 0.03488585592753535, + "grad_norm": 0.18780949711799622, + "learning_rate": 1.138e-05, + "loss": 0.0097, + "step": 570 + }, + { + "epoch": 0.035497888487667544, + "grad_norm": 0.20399607717990875, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0092, + "step": 580 + }, + { + "epoch": 0.03610992104779974, + "grad_norm": 0.15931005775928497, + "learning_rate": 1.178e-05, + "loss": 0.0076, + "step": 590 + }, + { + "epoch": 0.03672195360793194, + "grad_norm": 0.20751547813415527, + "learning_rate": 1.198e-05, + "loss": 0.0079, + "step": 600 + }, + { + "epoch": 0.03733398616806414, + "grad_norm": 0.39666953682899475, + "learning_rate": 1.218e-05, + "loss": 0.0072, + "step": 610 + }, + { + "epoch": 0.03794601872819634, + "grad_norm": 0.385407030582428, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0089, + "step": 620 + }, + { + "epoch": 0.03855805128832854, + "grad_norm": 0.5228332877159119, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0092, + "step": 630 + }, + { + "epoch": 0.03917008384846074, + "grad_norm": 0.29315415024757385, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0098, + "step": 640 + }, + { + "epoch": 0.03978211640859294, + "grad_norm": 0.4300646483898163, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0079, + "step": 650 + }, + { + "epoch": 0.04039414896872514, + "grad_norm": 0.38021156191825867, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0103, + "step": 660 + }, + { + "epoch": 0.041006181528857336, + "grad_norm": 0.43489688634872437, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0105, + "step": 670 + }, + { + "epoch": 0.04161821408898953, + "grad_norm": 0.48019328713417053, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0124, + "step": 680 + }, + { + "epoch": 0.042230246649121735, + "grad_norm": 0.28486984968185425, + "learning_rate": 1.378e-05, + "loss": 0.0122, + "step": 690 + }, + { + "epoch": 0.04284227920925393, + "grad_norm": 0.35172080993652344, + "learning_rate": 1.398e-05, + "loss": 0.0093, + "step": 700 + }, + { + "epoch": 0.043454311769386134, + "grad_norm": 0.32531124353408813, + "learning_rate": 1.418e-05, + "loss": 0.0116, + "step": 710 + }, + { + "epoch": 0.04406634432951833, + "grad_norm": 0.388637512922287, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0077, + "step": 720 + }, + { + "epoch": 0.04467837688965053, + "grad_norm": 0.3816429078578949, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0123, + "step": 730 + }, + { + "epoch": 0.04529040944978273, + "grad_norm": 0.22786036133766174, + "learning_rate": 1.478e-05, + "loss": 0.0089, + "step": 740 + }, + { + "epoch": 0.045902442009914925, + "grad_norm": 0.2965328097343445, + "learning_rate": 1.498e-05, + "loss": 0.011, + "step": 750 + }, + { + "epoch": 0.04651447457004713, + "grad_norm": 0.3568362593650818, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0101, + "step": 760 + }, + { + "epoch": 0.047126507130179324, + "grad_norm": 0.2972166836261749, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0093, + "step": 770 + }, + { + "epoch": 0.04773853969031153, + "grad_norm": 0.4221388101577759, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.012, + "step": 780 + }, + { + "epoch": 0.04835057225044372, + "grad_norm": 0.37255391478538513, + "learning_rate": 1.578e-05, + "loss": 0.0085, + "step": 790 + }, + { + "epoch": 0.048962604810575926, + "grad_norm": 0.36007094383239746, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.008, + "step": 800 + }, + { + "epoch": 0.04957463737070812, + "grad_norm": 0.40588808059692383, + "learning_rate": 1.618e-05, + "loss": 0.0081, + "step": 810 + }, + { + "epoch": 0.05018666993084032, + "grad_norm": 0.46563687920570374, + "learning_rate": 1.638e-05, + "loss": 0.0076, + "step": 820 + }, + { + "epoch": 0.05079870249097252, + "grad_norm": 0.3161381483078003, + "learning_rate": 1.658e-05, + "loss": 0.0129, + "step": 830 + }, + { + "epoch": 0.05141073505110472, + "grad_norm": 0.3800298869609833, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0146, + "step": 840 + }, + { + "epoch": 0.05202276761123692, + "grad_norm": 0.36572107672691345, + "learning_rate": 1.698e-05, + "loss": 0.0148, + "step": 850 + }, + { + "epoch": 0.052634800171369116, + "grad_norm": 0.4084141254425049, + "learning_rate": 1.718e-05, + "loss": 0.0085, + "step": 860 + }, + { + "epoch": 0.05324683273150132, + "grad_norm": 0.2906867265701294, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0116, + "step": 870 + }, + { + "epoch": 0.053858865291633515, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.758e-05, + "loss": 0.0076, + "step": 880 + }, + { + "epoch": 0.05447089785176571, + "grad_norm": 0.5292996764183044, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0094, + "step": 890 + }, + { + "epoch": 0.055082930411897914, + "grad_norm": 0.23192685842514038, + "learning_rate": 1.798e-05, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 0.05569496297203011, + "grad_norm": 0.41050270199775696, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0099, + "step": 910 + }, + { + "epoch": 0.05630699553216231, + "grad_norm": 0.3336002230644226, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.05691902809229451, + "grad_norm": 0.46233776211738586, + "learning_rate": 1.858e-05, + "loss": 0.0104, + "step": 930 + }, + { + "epoch": 0.05753106065242671, + "grad_norm": 0.36776405572891235, + "learning_rate": 1.878e-05, + "loss": 0.0115, + "step": 940 + }, + { + "epoch": 0.05814309321255891, + "grad_norm": 0.47848618030548096, + "learning_rate": 1.898e-05, + "loss": 0.0108, + "step": 950 + }, + { + "epoch": 0.058755125772691104, + "grad_norm": 0.35507604479789734, + "learning_rate": 1.918e-05, + "loss": 0.0095, + "step": 960 + }, + { + "epoch": 0.05936715833282331, + "grad_norm": 0.4613397717475891, + "learning_rate": 1.938e-05, + "loss": 0.0119, + "step": 970 + }, + { + "epoch": 0.0599791908929555, + "grad_norm": 0.34492260217666626, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0109, + "step": 980 + }, + { + "epoch": 0.060591223453087706, + "grad_norm": 0.34624582529067993, + "learning_rate": 1.978e-05, + "loss": 0.0099, + "step": 990 + }, + { + "epoch": 0.0612032560132199, + "grad_norm": 0.9161475896835327, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0109, + "step": 1000 + }, + { + "epoch": 0.061815288573352105, + "grad_norm": 0.367807537317276, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0087, + "step": 1010 + }, + { + "epoch": 0.0624273211334843, + "grad_norm": 0.4043216407299042, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.0084, + "step": 1020 + }, + { + "epoch": 0.0630393536936165, + "grad_norm": 0.315305233001709, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0075, + "step": 1030 + }, + { + "epoch": 0.06365138625374869, + "grad_norm": 0.49702969193458557, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0103, + "step": 1040 + }, + { + "epoch": 0.0642634188138809, + "grad_norm": 0.46286216378211975, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0116, + "step": 1050 + }, + { + "epoch": 0.0648754513740131, + "grad_norm": 0.332142174243927, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0103, + "step": 1060 + }, + { + "epoch": 0.0654874839341453, + "grad_norm": 0.6118510961532593, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0118, + "step": 1070 + }, + { + "epoch": 0.06609951649427749, + "grad_norm": 0.49074795842170715, + "learning_rate": 1.999967041472886e-05, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.0667115490544097, + "grad_norm": 0.42575374245643616, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0125, + "step": 1090 + }, + { + "epoch": 0.0673235816145419, + "grad_norm": 0.3223794996738434, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0091, + "step": 1100 + }, + { + "epoch": 0.06793561417467409, + "grad_norm": 0.4952760636806488, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.011, + "step": 1110 + }, + { + "epoch": 0.06854764673480629, + "grad_norm": 0.36144813895225525, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0096, + "step": 1120 + }, + { + "epoch": 0.06915967929493849, + "grad_norm": 0.31190025806427, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0115, + "step": 1130 + }, + { + "epoch": 0.0697717118550707, + "grad_norm": 0.7014928460121155, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.07038374441520288, + "grad_norm": 0.4382205605506897, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0111, + "step": 1150 + }, + { + "epoch": 0.07099577697533509, + "grad_norm": 0.3750714659690857, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0116, + "step": 1160 + }, + { + "epoch": 0.07160780953546729, + "grad_norm": 0.4174371361732483, + "learning_rate": 1.999849173538598e-05, + "loss": 0.009, + "step": 1170 + }, + { + "epoch": 0.07221984209559948, + "grad_norm": 0.44394591450691223, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.07283187465573168, + "grad_norm": 0.43412888050079346, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0098, + "step": 1190 + }, + { + "epoch": 0.07344390721586389, + "grad_norm": 0.6421196460723877, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.01, + "step": 1200 + }, + { + "epoch": 0.07405593977599609, + "grad_norm": 0.6313903331756592, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0137, + "step": 1210 + }, + { + "epoch": 0.07466797233612828, + "grad_norm": 0.49340254068374634, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0104, + "step": 1220 + }, + { + "epoch": 0.07528000489626048, + "grad_norm": 0.40420663356781006, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0126, + "step": 1230 + }, + { + "epoch": 0.07589203745639268, + "grad_norm": 0.3955318033695221, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.013, + "step": 1240 + }, + { + "epoch": 0.07650407001652489, + "grad_norm": 0.4967520236968994, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0098, + "step": 1250 + }, + { + "epoch": 0.07711610257665708, + "grad_norm": 0.3380029499530792, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0084, + "step": 1260 + }, + { + "epoch": 0.07772813513678928, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.009, + "step": 1270 + }, + { + "epoch": 0.07834016769692148, + "grad_norm": 0.4533286392688751, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0093, + "step": 1280 + }, + { + "epoch": 0.07895220025705367, + "grad_norm": 0.39559242129325867, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0106, + "step": 1290 + }, + { + "epoch": 0.07956423281718587, + "grad_norm": 0.23190362751483917, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.01, + "step": 1300 + }, + { + "epoch": 0.08017626537731808, + "grad_norm": 0.4732286334037781, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.08078829793745028, + "grad_norm": 0.3010174036026001, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.08140033049758247, + "grad_norm": 0.3989834189414978, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0097, + "step": 1330 + }, + { + "epoch": 0.08201236305771467, + "grad_norm": 0.4597114622592926, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.01, + "step": 1340 + }, + { + "epoch": 0.08262439561784687, + "grad_norm": 0.426826536655426, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.011, + "step": 1350 + }, + { + "epoch": 0.08323642817797906, + "grad_norm": 0.4876341223716736, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0105, + "step": 1360 + }, + { + "epoch": 0.08384846073811127, + "grad_norm": 0.5444457530975342, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0099, + "step": 1370 + }, + { + "epoch": 0.08446049329824347, + "grad_norm": 0.5096126794815063, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.007, + "step": 1380 + }, + { + "epoch": 0.08507252585837567, + "grad_norm": 0.43828368186950684, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.08568455841850786, + "grad_norm": 0.40163955092430115, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0103, + "step": 1400 + }, + { + "epoch": 0.08629659097864006, + "grad_norm": 0.3110432028770447, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0115, + "step": 1410 + }, + { + "epoch": 0.08690862353877227, + "grad_norm": 0.8393893241882324, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.012, + "step": 1420 + }, + { + "epoch": 0.08752065609890446, + "grad_norm": 0.2751714289188385, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0093, + "step": 1430 + }, + { + "epoch": 0.08813268865903666, + "grad_norm": 0.36969971656799316, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0112, + "step": 1440 + }, + { + "epoch": 0.08874472121916886, + "grad_norm": 0.3721938729286194, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08935675377930107, + "grad_norm": 0.26564934849739075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0107, + "step": 1460 + }, + { + "epoch": 0.08996878633943325, + "grad_norm": 0.36552169919013977, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0123, + "step": 1470 + }, + { + "epoch": 0.09058081889956546, + "grad_norm": 0.23664990067481995, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0074, + "step": 1480 + }, + { + "epoch": 0.09119285145969766, + "grad_norm": 0.49903133511543274, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.09180488401982985, + "grad_norm": 0.43505051732063293, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 0.09241691657996205, + "grad_norm": 0.20318932831287384, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0088, + "step": 1510 + }, + { + "epoch": 0.09302894914009426, + "grad_norm": 0.3289708197116852, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.008, + "step": 1520 + }, + { + "epoch": 0.09364098170022646, + "grad_norm": 0.3920934200286865, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0145, + "step": 1530 + }, + { + "epoch": 0.09425301426035865, + "grad_norm": 0.40396374464035034, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.09486504682049085, + "grad_norm": 0.4044182300567627, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.012, + "step": 1550 + }, + { + "epoch": 0.09547707938062305, + "grad_norm": 0.2318611741065979, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0115, + "step": 1560 + }, + { + "epoch": 0.09608911194075524, + "grad_norm": 0.3905714750289917, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.008, + "step": 1570 + }, + { + "epoch": 0.09670114450088745, + "grad_norm": 0.2516922652721405, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0084, + "step": 1580 + }, + { + "epoch": 0.09731317706101965, + "grad_norm": 0.338455468416214, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0122, + "step": 1590 + }, + { + "epoch": 0.09792520962115185, + "grad_norm": 0.31875041127204895, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0081, + "step": 1600 + }, + { + "epoch": 0.09853724218128404, + "grad_norm": 0.2996121644973755, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0068, + "step": 1610 + }, + { + "epoch": 0.09914927474141624, + "grad_norm": 0.4381162226200104, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0103, + "step": 1620 + }, + { + "epoch": 0.09976130730154845, + "grad_norm": 0.5531038045883179, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0168, + "step": 1630 + }, + { + "epoch": 0.10037333986168064, + "grad_norm": 1.1283385753631592, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0119, + "step": 1640 + }, + { + "epoch": 0.10098537242181284, + "grad_norm": 0.38017332553863525, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0096, + "step": 1650 + }, + { + "epoch": 0.10159740498194504, + "grad_norm": 0.4669477045536041, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0111, + "step": 1660 + }, + { + "epoch": 0.10220943754207724, + "grad_norm": 0.3903254270553589, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0098, + "step": 1670 + }, + { + "epoch": 0.10282147010220943, + "grad_norm": 0.49671587347984314, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0083, + "step": 1680 + }, + { + "epoch": 0.10343350266234164, + "grad_norm": 0.36555853486061096, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0088, + "step": 1690 + }, + { + "epoch": 0.10404553522247384, + "grad_norm": 0.21804726123809814, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0086, + "step": 1700 + }, + { + "epoch": 0.10465756778260603, + "grad_norm": 0.6744784116744995, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0109, + "step": 1710 + }, + { + "epoch": 0.10526960034273823, + "grad_norm": 0.34379470348358154, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0096, + "step": 1720 + }, + { + "epoch": 0.10588163290287043, + "grad_norm": 0.27760598063468933, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 0.10649366546300264, + "grad_norm": 0.36294442415237427, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10710569802313483, + "grad_norm": 0.42200908064842224, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.011, + "step": 1750 + }, + { + "epoch": 0.10771773058326703, + "grad_norm": 0.47863906621932983, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0107, + "step": 1760 + }, + { + "epoch": 0.10832976314339923, + "grad_norm": 0.32717248797416687, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0112, + "step": 1770 + }, + { + "epoch": 0.10894179570353142, + "grad_norm": 0.4255545735359192, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0106, + "step": 1780 + }, + { + "epoch": 0.10955382826366362, + "grad_norm": 0.5034983158111572, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0098, + "step": 1790 + }, + { + "epoch": 0.11016586082379583, + "grad_norm": 0.37071412801742554, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0099, + "step": 1800 + }, + { + "epoch": 0.11077789338392803, + "grad_norm": 0.23624737560749054, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0069, + "step": 1810 + }, + { + "epoch": 0.11138992594406022, + "grad_norm": 0.5815485715866089, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0095, + "step": 1820 + }, + { + "epoch": 0.11200195850419242, + "grad_norm": 1.1828722953796387, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0104, + "step": 1830 + }, + { + "epoch": 0.11261399106432463, + "grad_norm": 0.38099589943885803, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0106, + "step": 1840 + }, + { + "epoch": 0.11322602362445681, + "grad_norm": 0.38476184010505676, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0112, + "step": 1850 + }, + { + "epoch": 0.11383805618458902, + "grad_norm": 0.48982104659080505, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0125, + "step": 1860 + }, + { + "epoch": 0.11445008874472122, + "grad_norm": 0.4165821671485901, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0099, + "step": 1870 + }, + { + "epoch": 0.11506212130485342, + "grad_norm": 0.3412662446498871, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0061, + "step": 1880 + }, + { + "epoch": 0.11567415386498561, + "grad_norm": 0.46617937088012695, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0129, + "step": 1890 + }, + { + "epoch": 0.11628618642511782, + "grad_norm": 0.2705824077129364, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0082, + "step": 1900 + }, + { + "epoch": 0.11689821898525002, + "grad_norm": 0.3567829430103302, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0125, + "step": 1910 + }, + { + "epoch": 0.11751025154538221, + "grad_norm": 0.4438138008117676, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0129, + "step": 1920 + }, + { + "epoch": 0.11812228410551441, + "grad_norm": 0.356703519821167, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0083, + "step": 1930 + }, + { + "epoch": 0.11873431666564661, + "grad_norm": 0.6039804220199585, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0086, + "step": 1940 + }, + { + "epoch": 0.11934634922577882, + "grad_norm": 0.4572801887989044, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0067, + "step": 1950 + }, + { + "epoch": 0.119958381785911, + "grad_norm": 0.5063445568084717, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0091, + "step": 1960 + }, + { + "epoch": 0.12057041434604321, + "grad_norm": 0.3467857837677002, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.008, + "step": 1970 + }, + { + "epoch": 0.12118244690617541, + "grad_norm": 0.4875742197036743, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0102, + "step": 1980 + }, + { + "epoch": 0.1217944794663076, + "grad_norm": 0.3209119141101837, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0081, + "step": 1990 + }, + { + "epoch": 0.1224065120264398, + "grad_norm": 0.4731980860233307, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0118, + "step": 2000 + }, + { + "epoch": 0.123018544586572, + "grad_norm": 0.5742963552474976, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0125, + "step": 2010 + }, + { + "epoch": 0.12363057714670421, + "grad_norm": 0.41357406973838806, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.1242426097068364, + "grad_norm": 0.6277521252632141, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0096, + "step": 2030 + }, + { + "epoch": 0.1248546422669686, + "grad_norm": 0.41252902150154114, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0108, + "step": 2040 + }, + { + "epoch": 0.1254666748271008, + "grad_norm": 0.782122790813446, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0134, + "step": 2050 + }, + { + "epoch": 0.126078707387233, + "grad_norm": 0.45011264085769653, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0102, + "step": 2060 + }, + { + "epoch": 0.1266907399473652, + "grad_norm": 0.2724951207637787, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0088, + "step": 2070 + }, + { + "epoch": 0.12730277250749739, + "grad_norm": 0.2351481169462204, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.009, + "step": 2080 + }, + { + "epoch": 0.1279148050676296, + "grad_norm": 0.34568479657173157, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0092, + "step": 2090 + }, + { + "epoch": 0.1285268376277618, + "grad_norm": 0.44493499398231506, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0087, + "step": 2100 + }, + { + "epoch": 0.129138870187894, + "grad_norm": 0.3011283874511719, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0108, + "step": 2110 + }, + { + "epoch": 0.1297509027480262, + "grad_norm": 0.4170232117176056, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.1303629353081584, + "grad_norm": 0.2696056365966797, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0093, + "step": 2130 + }, + { + "epoch": 0.1309749678682906, + "grad_norm": 0.4092336893081665, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0083, + "step": 2140 + }, + { + "epoch": 0.13158700042842278, + "grad_norm": 0.36637401580810547, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.01, + "step": 2150 + }, + { + "epoch": 0.13219903298855498, + "grad_norm": 0.28675684332847595, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0079, + "step": 2160 + }, + { + "epoch": 0.13281106554868718, + "grad_norm": 0.27699902653694153, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0071, + "step": 2170 + }, + { + "epoch": 0.1334230981088194, + "grad_norm": 0.3832298517227173, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0104, + "step": 2180 + }, + { + "epoch": 0.1340351306689516, + "grad_norm": 0.3590598702430725, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 0.1346471632290838, + "grad_norm": 0.21830014884471893, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0093, + "step": 2200 + }, + { + "epoch": 0.135259195789216, + "grad_norm": 0.342492938041687, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0109, + "step": 2210 + }, + { + "epoch": 0.13587122834934817, + "grad_norm": 0.6337023973464966, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0082, + "step": 2220 + }, + { + "epoch": 0.13648326090948038, + "grad_norm": 0.41742798686027527, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13709529346961258, + "grad_norm": 0.3180190324783325, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 0.13770732602974478, + "grad_norm": 0.36720144748687744, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0114, + "step": 2250 + }, + { + "epoch": 0.13831935858987698, + "grad_norm": 0.29457366466522217, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0077, + "step": 2260 + }, + { + "epoch": 0.1389313911500092, + "grad_norm": 0.24702222645282745, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0074, + "step": 2270 + }, + { + "epoch": 0.1395434237101414, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0079, + "step": 2280 + }, + { + "epoch": 0.14015545627027357, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0127, + "step": 2290 + }, + { + "epoch": 0.14076748883040577, + "grad_norm": 0.44338247179985046, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0077, + "step": 2300 + }, + { + "epoch": 0.14137952139053797, + "grad_norm": 0.31765618920326233, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0091, + "step": 2310 + }, + { + "epoch": 0.14199155395067017, + "grad_norm": 0.322534441947937, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0068, + "step": 2320 + }, + { + "epoch": 0.14260358651080238, + "grad_norm": 0.23571068048477173, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0094, + "step": 2330 + }, + { + "epoch": 0.14321561907093458, + "grad_norm": 0.26818808913230896, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0092, + "step": 2340 + }, + { + "epoch": 0.14382765163106678, + "grad_norm": 0.31886982917785645, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0084, + "step": 2350 + }, + { + "epoch": 0.14443968419119896, + "grad_norm": 0.5176070928573608, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0104, + "step": 2360 + }, + { + "epoch": 0.14505171675133116, + "grad_norm": 0.4322161078453064, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0081, + "step": 2370 + }, + { + "epoch": 0.14566374931146336, + "grad_norm": 0.4076510965824127, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0102, + "step": 2380 + }, + { + "epoch": 0.14627578187159557, + "grad_norm": 0.3808838725090027, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0096, + "step": 2390 + }, + { + "epoch": 0.14688781443172777, + "grad_norm": 0.5045232176780701, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0109, + "step": 2400 + }, + { + "epoch": 0.14749984699185997, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0077, + "step": 2410 + }, + { + "epoch": 0.14811187955199218, + "grad_norm": 0.28561875224113464, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0141, + "step": 2420 + }, + { + "epoch": 0.14872391211212435, + "grad_norm": 0.414410799741745, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0094, + "step": 2430 + }, + { + "epoch": 0.14933594467225655, + "grad_norm": 0.4587285816669464, + "learning_rate": 1.989086647373215e-05, + "loss": 0.009, + "step": 2440 + }, + { + "epoch": 0.14994797723238876, + "grad_norm": 0.7567377686500549, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0084, + "step": 2450 + }, + { + "epoch": 0.15056000979252096, + "grad_norm": 0.4980221390724182, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0087, + "step": 2460 + }, + { + "epoch": 0.15117204235265316, + "grad_norm": 0.41810303926467896, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0082, + "step": 2470 + }, + { + "epoch": 0.15178407491278537, + "grad_norm": 0.4193445146083832, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 0.15239610747291757, + "grad_norm": 0.2561246156692505, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0074, + "step": 2490 + }, + { + "epoch": 0.15300814003304977, + "grad_norm": 0.22316500544548035, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0069, + "step": 2500 + }, + { + "epoch": 0.15362017259318195, + "grad_norm": 0.31504112482070923, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0097, + "step": 2510 + }, + { + "epoch": 0.15423220515331415, + "grad_norm": 0.2944568991661072, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0101, + "step": 2520 + }, + { + "epoch": 0.15484423771344635, + "grad_norm": 0.2744649052619934, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0074, + "step": 2530 + }, + { + "epoch": 0.15545627027357856, + "grad_norm": 0.2717166841030121, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.007, + "step": 2540 + }, + { + "epoch": 0.15606830283371076, + "grad_norm": 0.32652929425239563, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.15668033539384296, + "grad_norm": 0.3169964849948883, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0089, + "step": 2560 + }, + { + "epoch": 0.15729236795397517, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0083, + "step": 2570 + }, + { + "epoch": 0.15790440051410734, + "grad_norm": 0.3869011700153351, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0088, + "step": 2580 + }, + { + "epoch": 0.15851643307423954, + "grad_norm": 0.2944110333919525, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0085, + "step": 2590 + }, + { + "epoch": 0.15912846563437175, + "grad_norm": 0.27993839979171753, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0097, + "step": 2600 + }, + { + "epoch": 0.15974049819450395, + "grad_norm": 0.42018845677375793, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0104, + "step": 2610 + }, + { + "epoch": 0.16035253075463615, + "grad_norm": 0.45006832480430603, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0072, + "step": 2620 + }, + { + "epoch": 0.16096456331476836, + "grad_norm": 0.275564581155777, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0072, + "step": 2630 + }, + { + "epoch": 0.16157659587490056, + "grad_norm": 0.503052294254303, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0091, + "step": 2640 + }, + { + "epoch": 0.16218862843503273, + "grad_norm": 0.33740976452827454, + "learning_rate": 1.985678043265668e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.16280066099516494, + "grad_norm": 0.5379078984260559, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0097, + "step": 2660 + }, + { + "epoch": 0.16341269355529714, + "grad_norm": 0.3605813980102539, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0109, + "step": 2670 + }, + { + "epoch": 0.16402472611542934, + "grad_norm": 0.49490585923194885, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 0.16463675867556155, + "grad_norm": 0.29894375801086426, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0089, + "step": 2690 + }, + { + "epoch": 0.16524879123569375, + "grad_norm": 0.395270437002182, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 0.16586082379582595, + "grad_norm": 0.25507843494415283, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0078, + "step": 2710 + }, + { + "epoch": 0.16647285635595813, + "grad_norm": 0.3304852843284607, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0178, + "step": 2720 + }, + { + "epoch": 0.16708488891609033, + "grad_norm": 0.4356633126735687, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0082, + "step": 2730 + }, + { + "epoch": 0.16769692147622253, + "grad_norm": 0.4104527533054352, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 0.16830895403635474, + "grad_norm": 0.25723493099212646, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0097, + "step": 2750 + }, + { + "epoch": 0.16892098659648694, + "grad_norm": 0.3280608057975769, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0079, + "step": 2760 + }, + { + "epoch": 0.16953301915661914, + "grad_norm": 0.4641128480434418, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0081, + "step": 2770 + }, + { + "epoch": 0.17014505171675134, + "grad_norm": 0.2704941928386688, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0112, + "step": 2780 + }, + { + "epoch": 0.17075708427688352, + "grad_norm": 0.42343780398368835, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0084, + "step": 2790 + }, + { + "epoch": 0.17136911683701572, + "grad_norm": 0.2606532573699951, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0085, + "step": 2800 + }, + { + "epoch": 0.17198114939714793, + "grad_norm": 0.39099374413490295, + "learning_rate": 1.982773261916081e-05, + "loss": 0.014, + "step": 2810 + }, + { + "epoch": 0.17259318195728013, + "grad_norm": 0.32653889060020447, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0095, + "step": 2820 + }, + { + "epoch": 0.17320521451741233, + "grad_norm": 0.34765321016311646, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17381724707754453, + "grad_norm": 0.2844177186489105, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.011, + "step": 2840 + }, + { + "epoch": 0.17442927963767674, + "grad_norm": 0.5079899430274963, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0114, + "step": 2850 + }, + { + "epoch": 0.1750413121978089, + "grad_norm": 0.4043678045272827, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0073, + "step": 2860 + }, + { + "epoch": 0.17565334475794112, + "grad_norm": 0.3833003640174866, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0076, + "step": 2870 + }, + { + "epoch": 0.17626537731807332, + "grad_norm": 0.2826341986656189, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0072, + "step": 2880 + }, + { + "epoch": 0.17687740987820552, + "grad_norm": 0.6043460965156555, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0084, + "step": 2890 + }, + { + "epoch": 0.17748944243833772, + "grad_norm": 0.3238481879234314, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.17810147499846993, + "grad_norm": 0.45817995071411133, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0103, + "step": 2910 + }, + { + "epoch": 0.17871350755860213, + "grad_norm": 0.21048744022846222, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0094, + "step": 2920 + }, + { + "epoch": 0.1793255401187343, + "grad_norm": 0.3401891887187958, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0058, + "step": 2930 + }, + { + "epoch": 0.1799375726788665, + "grad_norm": 0.3655509948730469, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0071, + "step": 2940 + }, + { + "epoch": 0.1805496052389987, + "grad_norm": 0.47406241297721863, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.18116163779913091, + "grad_norm": 0.3278841972351074, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0121, + "step": 2960 + }, + { + "epoch": 0.18177367035926312, + "grad_norm": 0.271436482667923, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.008, + "step": 2970 + }, + { + "epoch": 0.18238570291939532, + "grad_norm": 0.41475561261177063, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.008, + "step": 2980 + }, + { + "epoch": 0.18299773547952752, + "grad_norm": 0.5389090776443481, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0091, + "step": 2990 + }, + { + "epoch": 0.1836097680396597, + "grad_norm": 0.3958609700202942, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0106, + "step": 3000 + }, + { + "epoch": 0.1842218005997919, + "grad_norm": 0.3456019461154938, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0084, + "step": 3010 + }, + { + "epoch": 0.1848338331599241, + "grad_norm": 0.2959386706352234, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0071, + "step": 3020 + }, + { + "epoch": 0.1854458657200563, + "grad_norm": 0.2617223858833313, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0082, + "step": 3030 + }, + { + "epoch": 0.1860578982801885, + "grad_norm": 0.45173966884613037, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0105, + "step": 3040 + }, + { + "epoch": 0.1866699308403207, + "grad_norm": 0.4127421975135803, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.008, + "step": 3050 + }, + { + "epoch": 0.18728196340045292, + "grad_norm": 0.3142230808734894, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0085, + "step": 3060 + }, + { + "epoch": 0.1878939959605851, + "grad_norm": 0.49720287322998047, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0089, + "step": 3070 + }, + { + "epoch": 0.1885060285207173, + "grad_norm": 0.6417365074157715, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0088, + "step": 3080 + }, + { + "epoch": 0.1891180610808495, + "grad_norm": 0.44801583886146545, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0098, + "step": 3090 + }, + { + "epoch": 0.1897300936409817, + "grad_norm": 0.3606127202510834, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0087, + "step": 3100 + }, + { + "epoch": 0.1903421262011139, + "grad_norm": 0.268971711397171, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0092, + "step": 3110 + }, + { + "epoch": 0.1909541587612461, + "grad_norm": 0.2367011308670044, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0074, + "step": 3120 + }, + { + "epoch": 0.1915661913213783, + "grad_norm": 0.41643625497817993, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0062, + "step": 3130 + }, + { + "epoch": 0.19217822388151048, + "grad_norm": 0.33202284574508667, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0081, + "step": 3140 + }, + { + "epoch": 0.1927902564416427, + "grad_norm": 0.279813289642334, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0074, + "step": 3150 + }, + { + "epoch": 0.1934022890017749, + "grad_norm": 0.5127174258232117, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0101, + "step": 3160 + }, + { + "epoch": 0.1940143215619071, + "grad_norm": 0.36921849846839905, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.1946263541220393, + "grad_norm": 0.3509728014469147, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0114, + "step": 3180 + }, + { + "epoch": 0.1952383866821715, + "grad_norm": 0.3088139295578003, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0089, + "step": 3190 + }, + { + "epoch": 0.1958504192423037, + "grad_norm": 0.43653762340545654, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0116, + "step": 3200 + }, + { + "epoch": 0.19646245180243588, + "grad_norm": 0.2522308826446533, + "learning_rate": 1.974353140804231e-05, + "loss": 0.007, + "step": 3210 + }, + { + "epoch": 0.19707448436256808, + "grad_norm": 0.37519100308418274, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 0.19768651692270028, + "grad_norm": 0.379027783870697, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0086, + "step": 3230 + }, + { + "epoch": 0.1982985494828325, + "grad_norm": 0.2713090479373932, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0073, + "step": 3240 + }, + { + "epoch": 0.1989105820429647, + "grad_norm": 0.41106846928596497, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0188, + "step": 3250 + }, + { + "epoch": 0.1995226146030969, + "grad_norm": 0.3914758861064911, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 0.2001346471632291, + "grad_norm": 0.4763018488883972, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0082, + "step": 3270 + }, + { + "epoch": 0.20074667972336127, + "grad_norm": 0.23002664744853973, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0085, + "step": 3280 + }, + { + "epoch": 0.20135871228349347, + "grad_norm": 0.2887377142906189, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0074, + "step": 3290 + }, + { + "epoch": 0.20197074484362568, + "grad_norm": 0.2322079837322235, + "learning_rate": 1.972231769371516e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 0.20258277740375788, + "grad_norm": 0.39307233691215515, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0095, + "step": 3310 + }, + { + "epoch": 0.20319480996389008, + "grad_norm": 0.5209783315658569, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.012, + "step": 3320 + }, + { + "epoch": 0.20380684252402229, + "grad_norm": 0.45187172293663025, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.2044188750841545, + "grad_norm": 0.480970174074173, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0072, + "step": 3340 + }, + { + "epoch": 0.20503090764428666, + "grad_norm": 0.30979010462760925, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0091, + "step": 3350 + }, + { + "epoch": 0.20564294020441887, + "grad_norm": 0.6410729289054871, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0096, + "step": 3360 + }, + { + "epoch": 0.20625497276455107, + "grad_norm": 0.23707512021064758, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0078, + "step": 3370 + }, + { + "epoch": 0.20686700532468327, + "grad_norm": 0.3029544949531555, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0115, + "step": 3380 + }, + { + "epoch": 0.20747903788481548, + "grad_norm": 0.28677740693092346, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0068, + "step": 3390 + }, + { + "epoch": 0.20809107044494768, + "grad_norm": 0.2433662712574005, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0082, + "step": 3400 + }, + { + "epoch": 0.20870310300507988, + "grad_norm": 0.38066667318344116, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0089, + "step": 3410 + }, + { + "epoch": 0.20931513556521206, + "grad_norm": 0.3830282390117645, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0073, + "step": 3420 + }, + { + "epoch": 0.20992716812534426, + "grad_norm": 0.359684556722641, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0088, + "step": 3430 + }, + { + "epoch": 0.21053920068547646, + "grad_norm": 0.3497346341609955, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0082, + "step": 3440 + }, + { + "epoch": 0.21115123324560867, + "grad_norm": 0.3664748966693878, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.21176326580574087, + "grad_norm": 0.382804811000824, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0077, + "step": 3460 + }, + { + "epoch": 0.21237529836587307, + "grad_norm": 0.22746194899082184, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0107, + "step": 3470 + }, + { + "epoch": 0.21298733092600527, + "grad_norm": 0.4094266891479492, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0096, + "step": 3480 + }, + { + "epoch": 0.21359936348613745, + "grad_norm": 0.26990365982055664, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0089, + "step": 3490 + }, + { + "epoch": 0.21421139604626965, + "grad_norm": 0.2602371275424957, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0074, + "step": 3500 + }, + { + "epoch": 0.21482342860640186, + "grad_norm": 0.34200435876846313, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21543546116653406, + "grad_norm": 0.4260508716106415, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.21604749372666626, + "grad_norm": 0.4017483592033386, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0103, + "step": 3530 + }, + { + "epoch": 0.21665952628679847, + "grad_norm": 0.40005844831466675, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0094, + "step": 3540 + }, + { + "epoch": 0.21727155884693067, + "grad_norm": 0.3856841027736664, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0095, + "step": 3550 + }, + { + "epoch": 0.21788359140706284, + "grad_norm": 0.3245168626308441, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0067, + "step": 3560 + }, + { + "epoch": 0.21849562396719505, + "grad_norm": 0.2698485255241394, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0079, + "step": 3570 + }, + { + "epoch": 0.21910765652732725, + "grad_norm": 0.24520452320575714, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0109, + "step": 3580 + }, + { + "epoch": 0.21971968908745945, + "grad_norm": 0.397175133228302, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0114, + "step": 3590 + }, + { + "epoch": 0.22033172164759166, + "grad_norm": 0.40339091420173645, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.22094375420772386, + "grad_norm": 0.404435396194458, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0087, + "step": 3610 + }, + { + "epoch": 0.22155578676785606, + "grad_norm": 0.3300188183784485, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0073, + "step": 3620 + }, + { + "epoch": 0.22216781932798824, + "grad_norm": 0.23486892879009247, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0087, + "step": 3630 + }, + { + "epoch": 0.22277985188812044, + "grad_norm": 0.37211188673973083, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.22339188444825264, + "grad_norm": 0.32422709465026855, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.009, + "step": 3650 + }, + { + "epoch": 0.22400391700838485, + "grad_norm": 0.43535664677619934, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0099, + "step": 3660 + }, + { + "epoch": 0.22461594956851705, + "grad_norm": 0.3295724093914032, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.22522798212864925, + "grad_norm": 0.2840734124183655, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0082, + "step": 3680 + }, + { + "epoch": 0.22584001468878145, + "grad_norm": 0.2861844599246979, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0079, + "step": 3690 + }, + { + "epoch": 0.22645204724891363, + "grad_norm": 0.3194407820701599, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0071, + "step": 3700 + }, + { + "epoch": 0.22706407980904583, + "grad_norm": 0.38770729303359985, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0076, + "step": 3710 + }, + { + "epoch": 0.22767611236917804, + "grad_norm": 0.4637960195541382, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0093, + "step": 3720 + }, + { + "epoch": 0.22828814492931024, + "grad_norm": 0.31972312927246094, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0078, + "step": 3730 + }, + { + "epoch": 0.22890017748944244, + "grad_norm": 0.5273001790046692, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0076, + "step": 3740 + }, + { + "epoch": 0.22951221004957464, + "grad_norm": 0.30589622259140015, + "learning_rate": 1.960385541132679e-05, + "loss": 0.009, + "step": 3750 + }, + { + "epoch": 0.23012424260970685, + "grad_norm": 0.31634265184402466, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0063, + "step": 3760 + }, + { + "epoch": 0.23073627516983902, + "grad_norm": 0.32762402296066284, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0081, + "step": 3770 + }, + { + "epoch": 0.23134830772997123, + "grad_norm": 0.42696496844291687, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 0.23196034029010343, + "grad_norm": 0.4676671624183655, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0093, + "step": 3790 + }, + { + "epoch": 0.23257237285023563, + "grad_norm": 0.3347911536693573, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 0.23318440541036783, + "grad_norm": 0.3083193600177765, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0072, + "step": 3810 + }, + { + "epoch": 0.23379643797050004, + "grad_norm": 0.38178423047065735, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0083, + "step": 3820 + }, + { + "epoch": 0.23440847053063224, + "grad_norm": 0.2796846330165863, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0072, + "step": 3830 + }, + { + "epoch": 0.23502050309076442, + "grad_norm": 0.37444883584976196, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.23563253565089662, + "grad_norm": 0.3286772668361664, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23624456821102882, + "grad_norm": 0.45423513650894165, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.23685660077116102, + "grad_norm": 0.36881721019744873, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0068, + "step": 3870 + }, + { + "epoch": 0.23746863333129323, + "grad_norm": 0.3560579717159271, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0084, + "step": 3880 + }, + { + "epoch": 0.23808066589142543, + "grad_norm": 0.43887296319007874, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0076, + "step": 3890 + }, + { + "epoch": 0.23869269845155763, + "grad_norm": 0.3080165982246399, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0069, + "step": 3900 + }, + { + "epoch": 0.2393047310116898, + "grad_norm": 0.2327195703983307, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.239916763571822, + "grad_norm": 0.5960802435874939, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0088, + "step": 3920 + }, + { + "epoch": 0.24052879613195421, + "grad_norm": 0.36213600635528564, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0076, + "step": 3930 + }, + { + "epoch": 0.24114082869208642, + "grad_norm": 0.2950032949447632, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0115, + "step": 3940 + }, + { + "epoch": 0.24175286125221862, + "grad_norm": 0.4527084529399872, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0089, + "step": 3950 + }, + { + "epoch": 0.24236489381235082, + "grad_norm": 0.4422491192817688, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0135, + "step": 3960 + }, + { + "epoch": 0.24297692637248303, + "grad_norm": 0.45049232244491577, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 0.2435889589326152, + "grad_norm": 0.2566494941711426, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0095, + "step": 3980 + }, + { + "epoch": 0.2442009914927474, + "grad_norm": 0.49880343675613403, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0099, + "step": 3990 + }, + { + "epoch": 0.2448130240528796, + "grad_norm": 0.4699341952800751, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0084, + "step": 4000 + }, + { + "epoch": 0.2454250566130118, + "grad_norm": 0.41230708360671997, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 0.246037089173144, + "grad_norm": 0.4836854934692383, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.01, + "step": 4020 + }, + { + "epoch": 0.24664912173327622, + "grad_norm": 0.3056115508079529, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0082, + "step": 4030 + }, + { + "epoch": 0.24726115429340842, + "grad_norm": 0.151325523853302, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0062, + "step": 4040 + }, + { + "epoch": 0.2478731868535406, + "grad_norm": 0.3798811137676239, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0081, + "step": 4050 + }, + { + "epoch": 0.2484852194136728, + "grad_norm": 0.3308229148387909, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 0.249097251973805, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0074, + "step": 4070 + }, + { + "epoch": 0.2497092845339372, + "grad_norm": 0.24179549515247345, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 0.2503213170940694, + "grad_norm": 0.20879383385181427, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 0.2509333496542016, + "grad_norm": 0.39275774359703064, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.2515453822143338, + "grad_norm": 0.2925782799720764, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0095, + "step": 4110 + }, + { + "epoch": 0.252157414774466, + "grad_norm": 0.6465128660202026, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0102, + "step": 4120 + }, + { + "epoch": 0.2527694473345982, + "grad_norm": 0.34663915634155273, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.007, + "step": 4130 + }, + { + "epoch": 0.2533814798947304, + "grad_norm": 0.3387165367603302, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0091, + "step": 4140 + }, + { + "epoch": 0.2539935124548626, + "grad_norm": 0.32989630103111267, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0084, + "step": 4150 + }, + { + "epoch": 0.25460554501499477, + "grad_norm": 0.22870391607284546, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0071, + "step": 4160 + }, + { + "epoch": 0.255217577575127, + "grad_norm": 0.3866496682167053, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0079, + "step": 4170 + }, + { + "epoch": 0.2558296101352592, + "grad_norm": 0.29885268211364746, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0068, + "step": 4180 + }, + { + "epoch": 0.2564416426953914, + "grad_norm": 0.4693736135959625, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 0.2570536752555236, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0074, + "step": 4200 + }, + { + "epoch": 0.2576657078156558, + "grad_norm": 0.21141012012958527, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 0.258277740375788, + "grad_norm": 0.2284570336341858, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0102, + "step": 4220 + }, + { + "epoch": 0.2588897729359202, + "grad_norm": 0.4675048887729645, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0112, + "step": 4230 + }, + { + "epoch": 0.2595018054960524, + "grad_norm": 0.3906441628932953, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0088, + "step": 4240 + }, + { + "epoch": 0.2601138380561846, + "grad_norm": 0.22990387678146362, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 0.2607258706163168, + "grad_norm": 0.41871073842048645, + "learning_rate": 1.944490251296856e-05, + "loss": 0.009, + "step": 4260 + }, + { + "epoch": 0.261337903176449, + "grad_norm": 0.2724440395832062, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0074, + "step": 4270 + }, + { + "epoch": 0.2619499357365812, + "grad_norm": 0.42590636014938354, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0069, + "step": 4280 + }, + { + "epoch": 0.2625619682967134, + "grad_norm": 0.3604855239391327, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0068, + "step": 4290 + }, + { + "epoch": 0.26317400085684556, + "grad_norm": 0.475304514169693, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0082, + "step": 4300 + }, + { + "epoch": 0.26378603341697776, + "grad_norm": 0.24752479791641235, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0065, + "step": 4310 + }, + { + "epoch": 0.26439806597710996, + "grad_norm": 0.4384835958480835, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0104, + "step": 4320 + }, + { + "epoch": 0.26501009853724217, + "grad_norm": 0.24999107420444489, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0076, + "step": 4330 + }, + { + "epoch": 0.26562213109737437, + "grad_norm": 0.292491614818573, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 0.2662341636575066, + "grad_norm": 0.2380208522081375, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0069, + "step": 4350 + }, + { + "epoch": 0.2668461962176388, + "grad_norm": 0.2906023859977722, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0063, + "step": 4360 + }, + { + "epoch": 0.267458228777771, + "grad_norm": 0.4718990623950958, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0074, + "step": 4370 + }, + { + "epoch": 0.2680702613379032, + "grad_norm": 0.33257269859313965, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0073, + "step": 4380 + }, + { + "epoch": 0.2686822938980354, + "grad_norm": 0.34411463141441345, + "learning_rate": 1.940024231916886e-05, + "loss": 0.006, + "step": 4390 + }, + { + "epoch": 0.2692943264581676, + "grad_norm": 0.40312516689300537, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0093, + "step": 4400 + }, + { + "epoch": 0.2699063590182998, + "grad_norm": 0.2248350828886032, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0082, + "step": 4410 + }, + { + "epoch": 0.270518391578432, + "grad_norm": 0.30094820261001587, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0092, + "step": 4420 + }, + { + "epoch": 0.2711304241385642, + "grad_norm": 0.4277440309524536, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0099, + "step": 4430 + }, + { + "epoch": 0.27174245669869634, + "grad_norm": 0.2876254916191101, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0078, + "step": 4440 + }, + { + "epoch": 0.27235448925882855, + "grad_norm": 0.3453986346721649, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0096, + "step": 4450 + }, + { + "epoch": 0.27296652181896075, + "grad_norm": 0.31379634141921997, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0088, + "step": 4460 + }, + { + "epoch": 0.27357855437909295, + "grad_norm": 0.294477254152298, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0073, + "step": 4470 + }, + { + "epoch": 0.27419058693922516, + "grad_norm": 0.3773270845413208, + "learning_rate": 1.936834723687526e-05, + "loss": 0.008, + "step": 4480 + }, + { + "epoch": 0.27480261949935736, + "grad_norm": 0.31942978501319885, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0086, + "step": 4490 + }, + { + "epoch": 0.27541465205948956, + "grad_norm": 0.46827632188796997, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27602668461962176, + "grad_norm": 0.2735249102115631, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0107, + "step": 4510 + }, + { + "epoch": 0.27663871717975397, + "grad_norm": 0.30048197507858276, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0082, + "step": 4520 + }, + { + "epoch": 0.27725074973988617, + "grad_norm": 0.3507469594478607, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0093, + "step": 4530 + }, + { + "epoch": 0.2778627823000184, + "grad_norm": 0.5642989277839661, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0091, + "step": 4540 + }, + { + "epoch": 0.2784748148601506, + "grad_norm": 0.2769993245601654, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0105, + "step": 4550 + }, + { + "epoch": 0.2790868474202828, + "grad_norm": 0.30269622802734375, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0066, + "step": 4560 + }, + { + "epoch": 0.279698879980415, + "grad_norm": 0.3717023432254791, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0114, + "step": 4570 + }, + { + "epoch": 0.28031091254054713, + "grad_norm": 0.5065163373947144, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0075, + "step": 4580 + }, + { + "epoch": 0.28092294510067933, + "grad_norm": 0.4302189350128174, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0077, + "step": 4590 + }, + { + "epoch": 0.28153497766081154, + "grad_norm": 0.44008374214172363, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0086, + "step": 4600 + }, + { + "epoch": 0.28214701022094374, + "grad_norm": 0.4647364318370819, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0073, + "step": 4610 + }, + { + "epoch": 0.28275904278107594, + "grad_norm": 0.4229913651943207, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0077, + "step": 4620 + }, + { + "epoch": 0.28337107534120815, + "grad_norm": 0.36600178480148315, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.28398310790134035, + "grad_norm": 0.47143280506134033, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0085, + "step": 4640 + }, + { + "epoch": 0.28459514046147255, + "grad_norm": 0.29140496253967285, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0056, + "step": 4650 + }, + { + "epoch": 0.28520717302160475, + "grad_norm": 0.3964666426181793, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0071, + "step": 4660 + }, + { + "epoch": 0.28581920558173696, + "grad_norm": 0.407536119222641, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0095, + "step": 4670 + }, + { + "epoch": 0.28643123814186916, + "grad_norm": 0.33687031269073486, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0091, + "step": 4680 + }, + { + "epoch": 0.28704327070200136, + "grad_norm": 0.3182448446750641, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0087, + "step": 4690 + }, + { + "epoch": 0.28765530326213357, + "grad_norm": 0.40998023748397827, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0099, + "step": 4700 + }, + { + "epoch": 0.28826733582226577, + "grad_norm": 0.28750360012054443, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0087, + "step": 4710 + }, + { + "epoch": 0.2888793683823979, + "grad_norm": 0.36494627594947815, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 0.2894914009425301, + "grad_norm": 0.37047910690307617, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0094, + "step": 4730 + }, + { + "epoch": 0.2901034335026623, + "grad_norm": 0.2577553987503052, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.2907154660627945, + "grad_norm": 0.24589397013187408, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0067, + "step": 4750 + }, + { + "epoch": 0.29132749862292673, + "grad_norm": 0.37927499413490295, + "learning_rate": 1.926404507646751e-05, + "loss": 0.008, + "step": 4760 + }, + { + "epoch": 0.29193953118305893, + "grad_norm": 0.40547946095466614, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0101, + "step": 4770 + }, + { + "epoch": 0.29255156374319113, + "grad_norm": 0.47896578907966614, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0114, + "step": 4780 + }, + { + "epoch": 0.29316359630332334, + "grad_norm": 0.42911696434020996, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0066, + "step": 4790 + }, + { + "epoch": 0.29377562886345554, + "grad_norm": 0.21735505759716034, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0072, + "step": 4800 + }, + { + "epoch": 0.29438766142358774, + "grad_norm": 0.25916650891304016, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.29499969398371995, + "grad_norm": 0.23863966763019562, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0086, + "step": 4820 + }, + { + "epoch": 0.29561172654385215, + "grad_norm": 0.41552650928497314, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0078, + "step": 4830 + }, + { + "epoch": 0.29622375910398435, + "grad_norm": 0.2775874733924866, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0082, + "step": 4840 + }, + { + "epoch": 0.29683579166411656, + "grad_norm": 0.28962916135787964, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0088, + "step": 4850 + }, + { + "epoch": 0.2974478242242487, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0076, + "step": 4860 + }, + { + "epoch": 0.2980598567843809, + "grad_norm": 0.3833489716053009, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0096, + "step": 4870 + }, + { + "epoch": 0.2986718893445131, + "grad_norm": 0.20357537269592285, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.2992839219046453, + "grad_norm": 0.4648539423942566, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0099, + "step": 4890 + }, + { + "epoch": 0.2998959544647775, + "grad_norm": 0.2701941728591919, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0062, + "step": 4900 + }, + { + "epoch": 0.3005079870249097, + "grad_norm": 0.31277161836624146, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0069, + "step": 4910 + }, + { + "epoch": 0.3011200195850419, + "grad_norm": 0.27697697281837463, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0094, + "step": 4920 + }, + { + "epoch": 0.3017320521451741, + "grad_norm": 0.22880606353282928, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0074, + "step": 4930 + }, + { + "epoch": 0.3023440847053063, + "grad_norm": 0.258404940366745, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0078, + "step": 4940 + }, + { + "epoch": 0.30295611726543853, + "grad_norm": 0.394394189119339, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0109, + "step": 4950 + }, + { + "epoch": 0.30356814982557073, + "grad_norm": 0.24108687043190002, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0082, + "step": 4960 + }, + { + "epoch": 0.30418018238570294, + "grad_norm": 0.34520867466926575, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0098, + "step": 4970 + }, + { + "epoch": 0.30479221494583514, + "grad_norm": 0.33723267912864685, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0104, + "step": 4980 + }, + { + "epoch": 0.30540424750596734, + "grad_norm": 0.28276878595352173, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0072, + "step": 4990 + }, + { + "epoch": 0.30601628006609954, + "grad_norm": 0.32236188650131226, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.012, + "step": 5000 + }, + { + "epoch": 0.3066283126262317, + "grad_norm": 0.20596888661384583, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0073, + "step": 5010 + }, + { + "epoch": 0.3072403451863639, + "grad_norm": 0.37921255826950073, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0073, + "step": 5020 + }, + { + "epoch": 0.3078523777464961, + "grad_norm": 0.30738911032676697, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0083, + "step": 5030 + }, + { + "epoch": 0.3084644103066283, + "grad_norm": 0.1938163936138153, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.3090764428667605, + "grad_norm": 0.25826898217201233, + "learning_rate": 1.914800406458133e-05, + "loss": 0.008, + "step": 5050 + }, + { + "epoch": 0.3096884754268927, + "grad_norm": 0.18951697647571564, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0058, + "step": 5060 + }, + { + "epoch": 0.3103005079870249, + "grad_norm": 0.3877381980419159, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3109125405471571, + "grad_norm": 0.3133573830127716, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0088, + "step": 5080 + }, + { + "epoch": 0.3115245731072893, + "grad_norm": 0.33131852746009827, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0062, + "step": 5090 + }, + { + "epoch": 0.3121366056674215, + "grad_norm": 0.21276263892650604, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0071, + "step": 5100 + }, + { + "epoch": 0.3127486382275537, + "grad_norm": 0.46878281235694885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0084, + "step": 5110 + }, + { + "epoch": 0.3133606707876859, + "grad_norm": 0.44227683544158936, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0097, + "step": 5120 + }, + { + "epoch": 0.3139727033478181, + "grad_norm": 0.41950204968452454, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.31458473590795033, + "grad_norm": 0.4214445948600769, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0081, + "step": 5140 + }, + { + "epoch": 0.3151967684680825, + "grad_norm": 0.3779868483543396, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0075, + "step": 5150 + }, + { + "epoch": 0.3158088010282147, + "grad_norm": 0.4587777853012085, + "learning_rate": 1.910187855634501e-05, + "loss": 0.009, + "step": 5160 + }, + { + "epoch": 0.3164208335883469, + "grad_norm": 0.4875587224960327, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0099, + "step": 5170 + }, + { + "epoch": 0.3170328661484791, + "grad_norm": 0.22378237545490265, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0071, + "step": 5180 + }, + { + "epoch": 0.3176448987086113, + "grad_norm": 0.3360678553581238, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0101, + "step": 5190 + }, + { + "epoch": 0.3182569312687435, + "grad_norm": 0.36370640993118286, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0068, + "step": 5200 + }, + { + "epoch": 0.3188689638288757, + "grad_norm": 0.25814393162727356, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0076, + "step": 5210 + }, + { + "epoch": 0.3194809963890079, + "grad_norm": 0.39010074734687805, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0066, + "step": 5220 + }, + { + "epoch": 0.3200930289491401, + "grad_norm": 0.44009074568748474, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0078, + "step": 5230 + }, + { + "epoch": 0.3207050615092723, + "grad_norm": 0.45733046531677246, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0074, + "step": 5240 + }, + { + "epoch": 0.3213170940694045, + "grad_norm": 0.4555135667324066, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0089, + "step": 5250 + }, + { + "epoch": 0.3219291266295367, + "grad_norm": 0.5864276885986328, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0083, + "step": 5260 + }, + { + "epoch": 0.3225411591896689, + "grad_norm": 0.3305470943450928, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0094, + "step": 5270 + }, + { + "epoch": 0.3231531917498011, + "grad_norm": 0.21458053588867188, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0091, + "step": 5280 + }, + { + "epoch": 0.32376522430993326, + "grad_norm": 0.2927384376525879, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.007, + "step": 5290 + }, + { + "epoch": 0.32437725687006547, + "grad_norm": 0.387608140707016, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0068, + "step": 5300 + }, + { + "epoch": 0.32498928943019767, + "grad_norm": 0.28193122148513794, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0065, + "step": 5310 + }, + { + "epoch": 0.3256013219903299, + "grad_norm": 0.33098119497299194, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0082, + "step": 5320 + }, + { + "epoch": 0.3262133545504621, + "grad_norm": 0.5442482233047485, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0124, + "step": 5330 + }, + { + "epoch": 0.3268253871105943, + "grad_norm": 0.503669798374176, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0093, + "step": 5340 + }, + { + "epoch": 0.3274374196707265, + "grad_norm": 0.2307574301958084, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0071, + "step": 5350 + }, + { + "epoch": 0.3280494522308587, + "grad_norm": 0.3543917238712311, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.007, + "step": 5360 + }, + { + "epoch": 0.3286614847909909, + "grad_norm": 0.21763169765472412, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0059, + "step": 5370 + }, + { + "epoch": 0.3292735173511231, + "grad_norm": 0.38023391366004944, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 0.3298855499112553, + "grad_norm": 0.44597327709198, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0069, + "step": 5390 + }, + { + "epoch": 0.3304975824713875, + "grad_norm": 0.2994389533996582, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0083, + "step": 5400 + }, + { + "epoch": 0.3311096150315197, + "grad_norm": 0.26668304204940796, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0071, + "step": 5410 + }, + { + "epoch": 0.3317216475916519, + "grad_norm": 0.25944197177886963, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0065, + "step": 5420 + }, + { + "epoch": 0.33233368015178405, + "grad_norm": 0.3646431267261505, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0065, + "step": 5430 + }, + { + "epoch": 0.33294571271191625, + "grad_norm": 0.34860959649086, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0098, + "step": 5440 + }, + { + "epoch": 0.33355774527204846, + "grad_norm": 0.33718568086624146, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0069, + "step": 5450 + }, + { + "epoch": 0.33416977783218066, + "grad_norm": 0.2417302280664444, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0064, + "step": 5460 + }, + { + "epoch": 0.33478181039231286, + "grad_norm": 0.26607826352119446, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0048, + "step": 5470 + }, + { + "epoch": 0.33539384295244506, + "grad_norm": 0.31762364506721497, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0064, + "step": 5480 + }, + { + "epoch": 0.33600587551257727, + "grad_norm": 0.21427015960216522, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0079, + "step": 5490 + }, + { + "epoch": 0.33661790807270947, + "grad_norm": 0.3372637629508972, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.3372299406328417, + "grad_norm": 0.3760700821876526, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0066, + "step": 5510 + }, + { + "epoch": 0.3378419731929739, + "grad_norm": 0.22838029265403748, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0061, + "step": 5520 + }, + { + "epoch": 0.3384540057531061, + "grad_norm": 0.3105243444442749, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0089, + "step": 5530 + }, + { + "epoch": 0.3390660383132383, + "grad_norm": 0.23694929480552673, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0086, + "step": 5540 + }, + { + "epoch": 0.3396780708733705, + "grad_norm": 0.22935174405574799, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.3402901034335027, + "grad_norm": 0.26384714245796204, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 0.34090213599363484, + "grad_norm": 0.33245643973350525, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0061, + "step": 5570 + }, + { + "epoch": 0.34151416855376704, + "grad_norm": 0.3904813230037689, + "learning_rate": 1.891523933768891e-05, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.34212620111389924, + "grad_norm": 0.33858415484428406, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0067, + "step": 5590 + }, + { + "epoch": 0.34273823367403145, + "grad_norm": 0.3197486996650696, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0082, + "step": 5600 + }, + { + "epoch": 0.34335026623416365, + "grad_norm": 0.23814789950847626, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0068, + "step": 5610 + }, + { + "epoch": 0.34396229879429585, + "grad_norm": 0.3820457458496094, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0082, + "step": 5620 + }, + { + "epoch": 0.34457433135442805, + "grad_norm": 0.27518680691719055, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0063, + "step": 5630 + }, + { + "epoch": 0.34518636391456026, + "grad_norm": 0.24741721153259277, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 0.34579839647469246, + "grad_norm": 0.5140052437782288, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0178, + "step": 5650 + }, + { + "epoch": 0.34641042903482466, + "grad_norm": 0.5363543033599854, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0097, + "step": 5660 + }, + { + "epoch": 0.34702246159495687, + "grad_norm": 0.41116055846214294, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0078, + "step": 5670 + }, + { + "epoch": 0.34763449415508907, + "grad_norm": 0.412762314081192, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0086, + "step": 5680 + }, + { + "epoch": 0.34824652671522127, + "grad_norm": 0.399527907371521, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0088, + "step": 5690 + }, + { + "epoch": 0.3488585592753535, + "grad_norm": 0.3447834551334381, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0084, + "step": 5700 + }, + { + "epoch": 0.3494705918354856, + "grad_norm": 0.3418859541416168, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0092, + "step": 5710 + }, + { + "epoch": 0.3500826243956178, + "grad_norm": 0.3336535692214966, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0065, + "step": 5720 + }, + { + "epoch": 0.35069465695575003, + "grad_norm": 0.34575122594833374, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0068, + "step": 5730 + }, + { + "epoch": 0.35130668951588223, + "grad_norm": 0.34325110912323, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.01, + "step": 5740 + }, + { + "epoch": 0.35191872207601443, + "grad_norm": 0.20104236900806427, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0085, + "step": 5750 + }, + { + "epoch": 0.35253075463614664, + "grad_norm": 0.33699074387550354, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0073, + "step": 5760 + }, + { + "epoch": 0.35314278719627884, + "grad_norm": 0.33322635293006897, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.35375481975641104, + "grad_norm": 0.26897475123405457, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0087, + "step": 5780 + }, + { + "epoch": 0.35436685231654325, + "grad_norm": 0.5310013890266418, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0067, + "step": 5790 + }, + { + "epoch": 0.35497888487667545, + "grad_norm": 0.4203440845012665, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0097, + "step": 5800 + }, + { + "epoch": 0.35559091743680765, + "grad_norm": 0.2179369181394577, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0082, + "step": 5810 + }, + { + "epoch": 0.35620294999693985, + "grad_norm": 0.2789444625377655, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0066, + "step": 5820 + }, + { + "epoch": 0.35681498255707206, + "grad_norm": 0.28009694814682007, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.007, + "step": 5830 + }, + { + "epoch": 0.35742701511720426, + "grad_norm": 0.304768443107605, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0065, + "step": 5840 + }, + { + "epoch": 0.3580390476773364, + "grad_norm": 0.2829401195049286, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0061, + "step": 5850 + }, + { + "epoch": 0.3586510802374686, + "grad_norm": 0.3388998508453369, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0083, + "step": 5860 + }, + { + "epoch": 0.3592631127976008, + "grad_norm": 0.3313426673412323, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 0.359875145357733, + "grad_norm": 0.2886904180049896, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0094, + "step": 5880 + }, + { + "epoch": 0.3604871779178652, + "grad_norm": 0.3132432997226715, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0086, + "step": 5890 + }, + { + "epoch": 0.3610992104779974, + "grad_norm": 0.37195107340812683, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0137, + "step": 5900 + }, + { + "epoch": 0.3617112430381296, + "grad_norm": 0.30853375792503357, + "learning_rate": 1.875708056549365e-05, + "loss": 0.01, + "step": 5910 + }, + { + "epoch": 0.36232327559826183, + "grad_norm": 0.39785459637641907, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0073, + "step": 5920 + }, + { + "epoch": 0.36293530815839403, + "grad_norm": 0.26958727836608887, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0059, + "step": 5930 + }, + { + "epoch": 0.36354734071852624, + "grad_norm": 0.354956716299057, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0069, + "step": 5940 + }, + { + "epoch": 0.36415937327865844, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0066, + "step": 5950 + }, + { + "epoch": 0.36477140583879064, + "grad_norm": 0.30000701546669006, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0075, + "step": 5960 + }, + { + "epoch": 0.36538343839892284, + "grad_norm": 0.5558263063430786, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0083, + "step": 5970 + }, + { + "epoch": 0.36599547095905505, + "grad_norm": 0.39146295189857483, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0062, + "step": 5980 + }, + { + "epoch": 0.3666075035191872, + "grad_norm": 0.44002753496170044, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0089, + "step": 5990 + }, + { + "epoch": 0.3672195360793194, + "grad_norm": 0.3220095932483673, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0097, + "step": 6000 + }, + { + "epoch": 0.3678315686394516, + "grad_norm": 0.3569507598876953, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0067, + "step": 6010 + }, + { + "epoch": 0.3684436011995838, + "grad_norm": 0.3004184365272522, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0084, + "step": 6020 + }, + { + "epoch": 0.369055633759716, + "grad_norm": 0.2931320071220398, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0079, + "step": 6030 + }, + { + "epoch": 0.3696676663198482, + "grad_norm": 0.39551016688346863, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0088, + "step": 6040 + }, + { + "epoch": 0.3702796988799804, + "grad_norm": 0.33755603432655334, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0082, + "step": 6050 + }, + { + "epoch": 0.3708917314401126, + "grad_norm": 0.3101558983325958, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0069, + "step": 6060 + }, + { + "epoch": 0.3715037640002448, + "grad_norm": 0.2921602129936218, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0065, + "step": 6070 + }, + { + "epoch": 0.372115796560377, + "grad_norm": 0.3601403832435608, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0063, + "step": 6080 + }, + { + "epoch": 0.3727278291205092, + "grad_norm": 0.34929168224334717, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0073, + "step": 6090 + }, + { + "epoch": 0.3733398616806414, + "grad_norm": 0.3987390995025635, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0068, + "step": 6100 + }, + { + "epoch": 0.37395189424077363, + "grad_norm": 0.2641090452671051, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0091, + "step": 6110 + }, + { + "epoch": 0.37456392680090583, + "grad_norm": 0.23139338195323944, + "learning_rate": 1.865125972978549e-05, + "loss": 0.006, + "step": 6120 + }, + { + "epoch": 0.375175959361038, + "grad_norm": 0.26552167534828186, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.3757879919211702, + "grad_norm": 0.43827885389328003, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0084, + "step": 6140 + }, + { + "epoch": 0.3764000244813024, + "grad_norm": 0.27495354413986206, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 0.3770120570414346, + "grad_norm": 0.36078640818595886, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0104, + "step": 6160 + }, + { + "epoch": 0.3776240896015668, + "grad_norm": 0.28252753615379333, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0103, + "step": 6170 + }, + { + "epoch": 0.378236122161699, + "grad_norm": 0.2674558162689209, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0069, + "step": 6180 + }, + { + "epoch": 0.3788481547218312, + "grad_norm": 0.21457509696483612, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.3794601872819634, + "grad_norm": 0.3142339885234833, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0075, + "step": 6200 + }, + { + "epoch": 0.3800722198420956, + "grad_norm": 0.32714203000068665, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0084, + "step": 6210 + }, + { + "epoch": 0.3806842524022278, + "grad_norm": 0.2632557153701782, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0072, + "step": 6220 + }, + { + "epoch": 0.38129628496236, + "grad_norm": 0.1893932968378067, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0063, + "step": 6230 + }, + { + "epoch": 0.3819083175224922, + "grad_norm": 0.49935290217399597, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0087, + "step": 6240 + }, + { + "epoch": 0.3825203500826244, + "grad_norm": 0.34605127573013306, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0101, + "step": 6250 + }, + { + "epoch": 0.3831323826427566, + "grad_norm": 0.3294198513031006, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0067, + "step": 6260 + }, + { + "epoch": 0.38374441520288877, + "grad_norm": 0.34797370433807373, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0065, + "step": 6270 + }, + { + "epoch": 0.38435644776302097, + "grad_norm": 0.37710750102996826, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0061, + "step": 6280 + }, + { + "epoch": 0.3849684803231532, + "grad_norm": 0.39949893951416016, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0066, + "step": 6290 + }, + { + "epoch": 0.3855805128832854, + "grad_norm": 0.33014294505119324, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0095, + "step": 6300 + }, + { + "epoch": 0.3861925454434176, + "grad_norm": 0.4329249858856201, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0089, + "step": 6310 + }, + { + "epoch": 0.3868045780035498, + "grad_norm": 0.298330157995224, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0076, + "step": 6320 + }, + { + "epoch": 0.387416610563682, + "grad_norm": 0.2672661542892456, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 0.3880286431238142, + "grad_norm": 0.48193076252937317, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0103, + "step": 6340 + }, + { + "epoch": 0.3886406756839464, + "grad_norm": 0.29180601239204407, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0075, + "step": 6350 + }, + { + "epoch": 0.3892527082440786, + "grad_norm": 0.21320492029190063, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0077, + "step": 6360 + }, + { + "epoch": 0.3898647408042108, + "grad_norm": 0.37252935767173767, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0078, + "step": 6370 + }, + { + "epoch": 0.390476773364343, + "grad_norm": 0.284586101770401, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0089, + "step": 6380 + }, + { + "epoch": 0.3910888059244752, + "grad_norm": 0.5030382871627808, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0088, + "step": 6390 + }, + { + "epoch": 0.3917008384846074, + "grad_norm": 0.357239305973053, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0069, + "step": 6400 + }, + { + "epoch": 0.39231287104473955, + "grad_norm": 0.20308594405651093, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0062, + "step": 6410 + }, + { + "epoch": 0.39292490360487176, + "grad_norm": 0.2678150534629822, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.39353693616500396, + "grad_norm": 0.35160595178604126, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0072, + "step": 6430 + }, + { + "epoch": 0.39414896872513616, + "grad_norm": 0.33254173398017883, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 0.39476100128526836, + "grad_norm": 0.22763408720493317, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0061, + "step": 6450 + }, + { + "epoch": 0.39537303384540057, + "grad_norm": 0.20889192819595337, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 0.39598506640553277, + "grad_norm": 0.22515206038951874, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0086, + "step": 6470 + }, + { + "epoch": 0.396597098965665, + "grad_norm": 0.36421817541122437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0064, + "step": 6480 + }, + { + "epoch": 0.3972091315257972, + "grad_norm": 0.3869773745536804, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0084, + "step": 6490 + }, + { + "epoch": 0.3978211640859294, + "grad_norm": 0.26248687505722046, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0061, + "step": 6500 + }, + { + "epoch": 0.3984331966460616, + "grad_norm": 0.22152310609817505, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0057, + "step": 6510 + }, + { + "epoch": 0.3990452292061938, + "grad_norm": 0.25921961665153503, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0071, + "step": 6520 + }, + { + "epoch": 0.399657261766326, + "grad_norm": 0.3289903998374939, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.4002692943264582, + "grad_norm": 0.2767571210861206, + "learning_rate": 1.8427795928237e-05, + "loss": 0.01, + "step": 6540 + }, + { + "epoch": 0.40088132688659034, + "grad_norm": 0.46339666843414307, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0064, + "step": 6550 + }, + { + "epoch": 0.40149335944672254, + "grad_norm": 0.2942553460597992, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0067, + "step": 6560 + }, + { + "epoch": 0.40210539200685474, + "grad_norm": 0.3868240714073181, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0085, + "step": 6570 + }, + { + "epoch": 0.40271742456698695, + "grad_norm": 0.3999684154987335, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0067, + "step": 6580 + }, + { + "epoch": 0.40332945712711915, + "grad_norm": 0.42856812477111816, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0091, + "step": 6590 + }, + { + "epoch": 0.40394148968725135, + "grad_norm": 0.3099806010723114, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0084, + "step": 6600 + }, + { + "epoch": 0.40455352224738356, + "grad_norm": 0.3798827826976776, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0066, + "step": 6610 + }, + { + "epoch": 0.40516555480751576, + "grad_norm": 0.19007280468940735, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 0.40577758736764796, + "grad_norm": 0.3723277151584625, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0085, + "step": 6630 + }, + { + "epoch": 0.40638961992778017, + "grad_norm": 0.21034900844097137, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0069, + "step": 6640 + }, + { + "epoch": 0.40700165248791237, + "grad_norm": 0.29838645458221436, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0095, + "step": 6650 + }, + { + "epoch": 0.40761368504804457, + "grad_norm": 0.2645854353904724, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0061, + "step": 6660 + }, + { + "epoch": 0.4082257176081768, + "grad_norm": 0.21633592247962952, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.006, + "step": 6670 + }, + { + "epoch": 0.408837750168309, + "grad_norm": 0.25387731194496155, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.008, + "step": 6680 + }, + { + "epoch": 0.4094497827284412, + "grad_norm": 0.3752288520336151, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0092, + "step": 6690 + }, + { + "epoch": 0.41006181528857333, + "grad_norm": 0.33368971943855286, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0072, + "step": 6700 + }, + { + "epoch": 0.41067384784870553, + "grad_norm": 0.34388917684555054, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0074, + "step": 6710 + }, + { + "epoch": 0.41128588040883773, + "grad_norm": 0.2683192789554596, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.007, + "step": 6720 + }, + { + "epoch": 0.41189791296896994, + "grad_norm": 0.5121234059333801, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0074, + "step": 6730 + }, + { + "epoch": 0.41250994552910214, + "grad_norm": 0.333406925201416, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0117, + "step": 6740 + }, + { + "epoch": 0.41312197808923434, + "grad_norm": 0.26011794805526733, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0062, + "step": 6750 + }, + { + "epoch": 0.41373401064936655, + "grad_norm": 0.28925821185112, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0066, + "step": 6760 + }, + { + "epoch": 0.41434604320949875, + "grad_norm": 0.2202957570552826, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0058, + "step": 6770 + }, + { + "epoch": 0.41495807576963095, + "grad_norm": 0.2740793824195862, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0072, + "step": 6780 + }, + { + "epoch": 0.41557010832976315, + "grad_norm": 0.46569427847862244, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0069, + "step": 6790 + }, + { + "epoch": 0.41618214088989536, + "grad_norm": 0.3959881067276001, + "learning_rate": 1.828172598376902e-05, + "loss": 0.009, + "step": 6800 + }, + { + "epoch": 0.41679417345002756, + "grad_norm": 0.2465214729309082, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0068, + "step": 6810 + }, + { + "epoch": 0.41740620601015976, + "grad_norm": 0.3207756280899048, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0083, + "step": 6820 + }, + { + "epoch": 0.41801823857029197, + "grad_norm": 0.5600990653038025, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.4186302711304241, + "grad_norm": 0.32832831144332886, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0072, + "step": 6840 + }, + { + "epoch": 0.4192423036905563, + "grad_norm": 0.3397129774093628, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0083, + "step": 6850 + }, + { + "epoch": 0.4198543362506885, + "grad_norm": 0.3481312096118927, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.4204663688108207, + "grad_norm": 0.4542059898376465, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0104, + "step": 6870 + }, + { + "epoch": 0.4210784013709529, + "grad_norm": 0.2517620325088501, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0089, + "step": 6880 + }, + { + "epoch": 0.42169043393108513, + "grad_norm": 0.3671923875808716, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0068, + "step": 6890 + }, + { + "epoch": 0.42230246649121733, + "grad_norm": 0.41340726613998413, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0084, + "step": 6900 + }, + { + "epoch": 0.42291449905134954, + "grad_norm": 0.22815965116024017, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 0.42352653161148174, + "grad_norm": 0.35324010252952576, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0073, + "step": 6920 + }, + { + "epoch": 0.42413856417161394, + "grad_norm": 0.30134323239326477, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0071, + "step": 6930 + }, + { + "epoch": 0.42475059673174614, + "grad_norm": 0.4007415771484375, + "learning_rate": 1.82006727813775e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 0.42536262929187835, + "grad_norm": 0.3320179879665375, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0074, + "step": 6950 + }, + { + "epoch": 0.42597466185201055, + "grad_norm": 0.311971515417099, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0062, + "step": 6960 + }, + { + "epoch": 0.42658669441214275, + "grad_norm": 0.34347453713417053, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 0.4271987269722749, + "grad_norm": 0.25632336735725403, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.4278107595324071, + "grad_norm": 0.21711130440235138, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0068, + "step": 6990 + }, + { + "epoch": 0.4284227920925393, + "grad_norm": 0.3381270170211792, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0064, + "step": 7000 + }, + { + "epoch": 0.4290348246526715, + "grad_norm": 0.32262885570526123, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0091, + "step": 7010 + }, + { + "epoch": 0.4296468572128037, + "grad_norm": 0.65865558385849, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0104, + "step": 7020 + }, + { + "epoch": 0.4302588897729359, + "grad_norm": 0.3021128177642822, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.009, + "step": 7030 + }, + { + "epoch": 0.4308709223330681, + "grad_norm": 0.2859005331993103, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0065, + "step": 7040 + }, + { + "epoch": 0.4314829548932003, + "grad_norm": 0.3379405736923218, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0062, + "step": 7050 + }, + { + "epoch": 0.4320949874533325, + "grad_norm": 0.22009991109371185, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.007, + "step": 7060 + }, + { + "epoch": 0.4327070200134647, + "grad_norm": 0.24766206741333008, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0072, + "step": 7070 + }, + { + "epoch": 0.43331905257359693, + "grad_norm": 0.3557615280151367, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0096, + "step": 7080 + }, + { + "epoch": 0.43393108513372913, + "grad_norm": 0.5700691938400269, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0066, + "step": 7090 + }, + { + "epoch": 0.43454311769386134, + "grad_norm": 0.3194892704486847, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0065, + "step": 7100 + }, + { + "epoch": 0.43515515025399354, + "grad_norm": 0.2766750752925873, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0074, + "step": 7110 + }, + { + "epoch": 0.4357671828141257, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.809403050791396e-05, + "loss": 0.007, + "step": 7120 + }, + { + "epoch": 0.4363792153742579, + "grad_norm": 0.4468507170677185, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0066, + "step": 7130 + }, + { + "epoch": 0.4369912479343901, + "grad_norm": 0.3282400369644165, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0185, + "step": 7140 + }, + { + "epoch": 0.4376032804945223, + "grad_norm": 0.2625710964202881, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0088, + "step": 7150 + }, + { + "epoch": 0.4382153130546545, + "grad_norm": 0.47729599475860596, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.008, + "step": 7160 + }, + { + "epoch": 0.4388273456147867, + "grad_norm": 0.30350950360298157, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0091, + "step": 7170 + }, + { + "epoch": 0.4394393781749189, + "grad_norm": 0.3514627516269684, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0065, + "step": 7180 + }, + { + "epoch": 0.4400514107350511, + "grad_norm": 0.26150578260421753, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0087, + "step": 7190 + }, + { + "epoch": 0.4406634432951833, + "grad_norm": 0.374138206243515, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0073, + "step": 7200 + }, + { + "epoch": 0.4412754758553155, + "grad_norm": 0.2980635166168213, + "learning_rate": 1.803969531201634e-05, + "loss": 0.007, + "step": 7210 + }, + { + "epoch": 0.4418875084154477, + "grad_norm": 0.38190510869026184, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0077, + "step": 7220 + }, + { + "epoch": 0.4424995409755799, + "grad_norm": 0.28819066286087036, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0065, + "step": 7230 + }, + { + "epoch": 0.4431115735357121, + "grad_norm": 0.43382275104522705, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0077, + "step": 7240 + }, + { + "epoch": 0.4437236060958443, + "grad_norm": 0.31589648127555847, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0065, + "step": 7250 + }, + { + "epoch": 0.4443356386559765, + "grad_norm": 0.3744536340236664, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0067, + "step": 7260 + }, + { + "epoch": 0.4449476712161087, + "grad_norm": 0.2600225806236267, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.008, + "step": 7270 + }, + { + "epoch": 0.4455597037762409, + "grad_norm": 0.28064799308776855, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0061, + "step": 7280 + }, + { + "epoch": 0.4461717363363731, + "grad_norm": 0.2745135426521301, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0075, + "step": 7290 + }, + { + "epoch": 0.4467837688965053, + "grad_norm": 0.23609793186187744, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 0.4473958014566375, + "grad_norm": 0.35910022258758545, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0079, + "step": 7310 + }, + { + "epoch": 0.4480078340167697, + "grad_norm": 0.22230662405490875, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0065, + "step": 7320 + }, + { + "epoch": 0.4486198665769019, + "grad_norm": 0.3835199475288391, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.008, + "step": 7330 + }, + { + "epoch": 0.4492318991370341, + "grad_norm": 0.37863102555274963, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0083, + "step": 7340 + }, + { + "epoch": 0.4498439316971663, + "grad_norm": 0.25412216782569885, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0056, + "step": 7350 + }, + { + "epoch": 0.4504559642572985, + "grad_norm": 0.43248918652534485, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0061, + "step": 7360 + }, + { + "epoch": 0.4510679968174307, + "grad_norm": 0.2937811613082886, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0073, + "step": 7370 + }, + { + "epoch": 0.4516800293775629, + "grad_norm": 0.3018436133861542, + "learning_rate": 1.793524061803872e-05, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.4522920619376951, + "grad_norm": 0.32781726121902466, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0079, + "step": 7390 + }, + { + "epoch": 0.45290409449782726, + "grad_norm": 0.2843719720840454, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0067, + "step": 7400 + }, + { + "epoch": 0.45351612705795946, + "grad_norm": 0.27588292956352234, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0092, + "step": 7410 + }, + { + "epoch": 0.45412815961809166, + "grad_norm": 0.38858234882354736, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0067, + "step": 7420 + }, + { + "epoch": 0.45474019217822387, + "grad_norm": 0.4235166609287262, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 0.45535222473835607, + "grad_norm": 0.272210031747818, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0059, + "step": 7440 + }, + { + "epoch": 0.4559642572984883, + "grad_norm": 0.23851896822452545, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0071, + "step": 7450 + }, + { + "epoch": 0.4565762898586205, + "grad_norm": 0.37179476022720337, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0073, + "step": 7460 + }, + { + "epoch": 0.4571883224187527, + "grad_norm": 0.31902605295181274, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.009, + "step": 7470 + }, + { + "epoch": 0.4578003549788849, + "grad_norm": 0.47023633122444153, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0091, + "step": 7480 + }, + { + "epoch": 0.4584123875390171, + "grad_norm": 0.35726839303970337, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0079, + "step": 7490 + }, + { + "epoch": 0.4590244200991493, + "grad_norm": 0.27567291259765625, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0073, + "step": 7500 + }, + { + "epoch": 0.4596364526592815, + "grad_norm": 0.23053516447544098, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0065, + "step": 7510 + }, + { + "epoch": 0.4602484852194137, + "grad_norm": 0.2169056385755539, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0054, + "step": 7520 + }, + { + "epoch": 0.4608605177795459, + "grad_norm": 0.2912258207798004, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.46147255033967804, + "grad_norm": 0.2527846097946167, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.006, + "step": 7540 + }, + { + "epoch": 0.46208458289981025, + "grad_norm": 0.3878445029258728, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 0.46269661545994245, + "grad_norm": 0.3981980085372925, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0081, + "step": 7560 + }, + { + "epoch": 0.46330864802007465, + "grad_norm": 0.48834845423698425, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "epoch": 0.46392068058020686, + "grad_norm": 0.3045276701450348, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0085, + "step": 7580 + }, + { + "epoch": 0.46453271314033906, + "grad_norm": 0.23345299065113068, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0072, + "step": 7590 + }, + { + "epoch": 0.46514474570047126, + "grad_norm": 0.3632943034172058, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 0.46575677826060347, + "grad_norm": 0.19813670217990875, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0073, + "step": 7610 + }, + { + "epoch": 0.46636881082073567, + "grad_norm": 0.36094173789024353, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0085, + "step": 7620 + }, + { + "epoch": 0.46698084338086787, + "grad_norm": 0.30049464106559753, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0085, + "step": 7630 + }, + { + "epoch": 0.4675928759410001, + "grad_norm": 0.27693697810173035, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.4682049085011323, + "grad_norm": 0.3656866252422333, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0064, + "step": 7650 + }, + { + "epoch": 0.4688169410612645, + "grad_norm": 0.602168083190918, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 0.4694289736213967, + "grad_norm": 0.3553078770637512, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0062, + "step": 7670 + }, + { + "epoch": 0.47004100618152883, + "grad_norm": 0.326695054769516, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0062, + "step": 7680 + }, + { + "epoch": 0.47065303874166103, + "grad_norm": 0.2762170732021332, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0075, + "step": 7690 + }, + { + "epoch": 0.47126507130179324, + "grad_norm": 0.35057321190834045, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0063, + "step": 7700 + }, + { + "epoch": 0.47187710386192544, + "grad_norm": 0.3906462788581848, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0086, + "step": 7710 + }, + { + "epoch": 0.47248913642205764, + "grad_norm": 0.290752112865448, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0087, + "step": 7720 + }, + { + "epoch": 0.47310116898218985, + "grad_norm": 0.2242034673690796, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.47371320154232205, + "grad_norm": 0.3283435106277466, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0068, + "step": 7740 + }, + { + "epoch": 0.47432523410245425, + "grad_norm": 0.24059069156646729, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0056, + "step": 7750 + }, + { + "epoch": 0.47493726666258645, + "grad_norm": 0.2978667914867401, + "learning_rate": 1.769330275540774e-05, + "loss": 0.007, + "step": 7760 + }, + { + "epoch": 0.47554929922271866, + "grad_norm": 0.2605571150779724, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0084, + "step": 7770 + }, + { + "epoch": 0.47616133178285086, + "grad_norm": 0.4010445475578308, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0075, + "step": 7780 + }, + { + "epoch": 0.47677336434298306, + "grad_norm": 0.31932029128074646, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0076, + "step": 7790 + }, + { + "epoch": 0.47738539690311527, + "grad_norm": 0.3508684039115906, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0067, + "step": 7800 + }, + { + "epoch": 0.47799742946324747, + "grad_norm": 0.2835206091403961, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0064, + "step": 7810 + }, + { + "epoch": 0.4786094620233796, + "grad_norm": 0.2661663293838501, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0051, + "step": 7820 + }, + { + "epoch": 0.4792214945835118, + "grad_norm": 0.4146379828453064, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.479833527143644, + "grad_norm": 0.38621196150779724, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0076, + "step": 7840 + }, + { + "epoch": 0.4804455597037762, + "grad_norm": 0.19052188098430634, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.008, + "step": 7850 + }, + { + "epoch": 0.48105759226390843, + "grad_norm": 0.3699149489402771, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0071, + "step": 7860 + }, + { + "epoch": 0.48166962482404063, + "grad_norm": 0.3756427764892578, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0071, + "step": 7870 + }, + { + "epoch": 0.48228165738417283, + "grad_norm": 0.2987386882305145, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0069, + "step": 7880 + }, + { + "epoch": 0.48289368994430504, + "grad_norm": 0.24891899526119232, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.48350572250443724, + "grad_norm": 0.44080299139022827, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.011, + "step": 7900 + }, + { + "epoch": 0.48411775506456944, + "grad_norm": 0.20801177620887756, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0089, + "step": 7910 + }, + { + "epoch": 0.48472978762470165, + "grad_norm": 0.31475305557250977, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0091, + "step": 7920 + }, + { + "epoch": 0.48534182018483385, + "grad_norm": 0.29783639311790466, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0082, + "step": 7930 + }, + { + "epoch": 0.48595385274496605, + "grad_norm": 0.3330203890800476, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0071, + "step": 7940 + }, + { + "epoch": 0.48656588530509826, + "grad_norm": 0.3537667691707611, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0068, + "step": 7950 + }, + { + "epoch": 0.4871779178652304, + "grad_norm": 0.2810688316822052, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0059, + "step": 7960 + }, + { + "epoch": 0.4877899504253626, + "grad_norm": 0.3359779715538025, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.4884019829854948, + "grad_norm": 0.36015257239341736, + "learning_rate": 1.754802282200567e-05, + "loss": 0.008, + "step": 7980 + }, + { + "epoch": 0.489014015545627, + "grad_norm": 0.2647690176963806, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0065, + "step": 7990 + }, + { + "epoch": 0.4896260481057592, + "grad_norm": 0.23366811871528625, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0068, + "step": 8000 + }, + { + "epoch": 0.4902380806658914, + "grad_norm": 0.2904139757156372, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0054, + "step": 8010 + }, + { + "epoch": 0.4908501132260236, + "grad_norm": 0.30941230058670044, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0069, + "step": 8020 + }, + { + "epoch": 0.4914621457861558, + "grad_norm": 0.1959473341703415, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0063, + "step": 8030 + }, + { + "epoch": 0.492074178346288, + "grad_norm": 0.33349713683128357, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0059, + "step": 8040 + }, + { + "epoch": 0.49268621090642023, + "grad_norm": 0.39017921686172485, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0067, + "step": 8050 + }, + { + "epoch": 0.49329824346655243, + "grad_norm": 0.36401957273483276, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0061, + "step": 8060 + }, + { + "epoch": 0.49391027602668464, + "grad_norm": 0.22296921908855438, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0071, + "step": 8070 + }, + { + "epoch": 0.49452230858681684, + "grad_norm": 0.8712129592895508, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0104, + "step": 8080 + }, + { + "epoch": 0.49513434114694904, + "grad_norm": 0.39942649006843567, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0068, + "step": 8090 + }, + { + "epoch": 0.4957463737070812, + "grad_norm": 0.3821292817592621, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0076, + "step": 8100 + }, + { + "epoch": 0.4963584062672134, + "grad_norm": 0.35861077904701233, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0086, + "step": 8110 + }, + { + "epoch": 0.4969704388273456, + "grad_norm": 0.38629451394081116, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0068, + "step": 8120 + }, + { + "epoch": 0.4975824713874778, + "grad_norm": 3.412374973297119, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0168, + "step": 8130 + }, + { + "epoch": 0.49819450394761, + "grad_norm": 0.2893833816051483, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0065, + "step": 8140 + }, + { + "epoch": 0.4988065365077422, + "grad_norm": 0.37679117918014526, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0058, + "step": 8150 + }, + { + "epoch": 0.4994185690678744, + "grad_norm": 0.2745130658149719, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0082, + "step": 8160 + }, + { + "epoch": 0.5000306016280066, + "grad_norm": 0.30250442028045654, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0065, + "step": 8170 + }, + { + "epoch": 0.5006426341881388, + "grad_norm": 0.19602464139461517, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0056, + "step": 8180 + }, + { + "epoch": 0.501254666748271, + "grad_norm": 0.4736115634441376, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0062, + "step": 8190 + }, + { + "epoch": 0.5018666993084032, + "grad_norm": 0.25439244508743286, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0069, + "step": 8200 + }, + { + "epoch": 0.5024787318685354, + "grad_norm": 0.19290995597839355, + "learning_rate": 1.739216409306913e-05, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.5030907644286676, + "grad_norm": 0.24844267964363098, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0071, + "step": 8220 + }, + { + "epoch": 0.5037027969887998, + "grad_norm": 0.21179668605327606, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 0.504314829548932, + "grad_norm": 0.29139387607574463, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.5049268621090642, + "grad_norm": 0.2621973752975464, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0068, + "step": 8250 + }, + { + "epoch": 0.5055388946691964, + "grad_norm": 0.23394125699996948, + "learning_rate": 1.735775329110705e-05, + "loss": 0.006, + "step": 8260 + }, + { + "epoch": 0.5061509272293286, + "grad_norm": 0.28399863839149475, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0067, + "step": 8270 + }, + { + "epoch": 0.5067629597894608, + "grad_norm": 0.5048072934150696, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.008, + "step": 8280 + }, + { + "epoch": 0.507374992349593, + "grad_norm": 0.33848801255226135, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0054, + "step": 8290 + }, + { + "epoch": 0.5079870249097252, + "grad_norm": 0.28341951966285706, + "learning_rate": 1.733009030001197e-05, + "loss": 0.008, + "step": 8300 + }, + { + "epoch": 0.5085990574698575, + "grad_norm": 0.3223153054714203, + "learning_rate": 1.732315596014244e-05, + "loss": 0.007, + "step": 8310 + }, + { + "epoch": 0.5092110900299895, + "grad_norm": 0.23227599263191223, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.5098231225901217, + "grad_norm": 0.2847786247730255, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.510435155150254, + "grad_norm": 0.2026357650756836, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 0.5110471877103862, + "grad_norm": 0.3617453873157501, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 0.5116592202705184, + "grad_norm": 0.4439109265804291, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0067, + "step": 8360 + }, + { + "epoch": 0.5122712528306506, + "grad_norm": 0.26640209555625916, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0086, + "step": 8370 + }, + { + "epoch": 0.5128832853907828, + "grad_norm": 0.38045984506607056, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.513495317950915, + "grad_norm": 0.23035791516304016, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.006, + "step": 8390 + }, + { + "epoch": 0.5141073505110472, + "grad_norm": 0.40618664026260376, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0071, + "step": 8400 + }, + { + "epoch": 0.5147193830711794, + "grad_norm": 0.2593354880809784, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0064, + "step": 8410 + }, + { + "epoch": 0.5153314156313116, + "grad_norm": 0.27723655104637146, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0062, + "step": 8420 + }, + { + "epoch": 0.5159434481914438, + "grad_norm": 0.3793911039829254, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0059, + "step": 8430 + }, + { + "epoch": 0.516555480751576, + "grad_norm": 0.28634312748908997, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0063, + "step": 8440 + }, + { + "epoch": 0.5171675133117082, + "grad_norm": 0.39417290687561035, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0078, + "step": 8450 + }, + { + "epoch": 0.5177795458718404, + "grad_norm": 0.3043057322502136, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0064, + "step": 8460 + }, + { + "epoch": 0.5183915784319726, + "grad_norm": 0.36794111132621765, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0106, + "step": 8470 + }, + { + "epoch": 0.5190036109921048, + "grad_norm": 0.312161922454834, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0067, + "step": 8480 + }, + { + "epoch": 0.519615643552237, + "grad_norm": 0.39240267872810364, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0065, + "step": 8490 + }, + { + "epoch": 0.5202276761123692, + "grad_norm": 0.4500446915626526, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0073, + "step": 8500 + }, + { + "epoch": 0.5208397086725014, + "grad_norm": 0.22808927297592163, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0072, + "step": 8510 + }, + { + "epoch": 0.5214517412326336, + "grad_norm": 0.3262411057949066, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0065, + "step": 8520 + }, + { + "epoch": 0.5220637737927658, + "grad_norm": 0.472229927778244, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0068, + "step": 8530 + }, + { + "epoch": 0.522675806352898, + "grad_norm": 0.31563568115234375, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5232878389130302, + "grad_norm": 0.27949750423431396, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0064, + "step": 8550 + }, + { + "epoch": 0.5238998714731624, + "grad_norm": 0.30297499895095825, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0075, + "step": 8560 + }, + { + "epoch": 0.5245119040332946, + "grad_norm": 0.3946770429611206, + "learning_rate": 1.714028248198457e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 0.5251239365934268, + "grad_norm": 0.3405992090702057, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0082, + "step": 8580 + }, + { + "epoch": 0.525735969153559, + "grad_norm": 0.2963511347770691, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0066, + "step": 8590 + }, + { + "epoch": 0.5263480017136911, + "grad_norm": 0.1909177303314209, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.006, + "step": 8600 + }, + { + "epoch": 0.5269600342738233, + "grad_norm": 0.3378836512565613, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 0.5275720668339555, + "grad_norm": 0.30862805247306824, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0067, + "step": 8620 + }, + { + "epoch": 0.5281840993940877, + "grad_norm": 0.397293359041214, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0071, + "step": 8630 + }, + { + "epoch": 0.5287961319542199, + "grad_norm": 0.3665411174297333, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0068, + "step": 8640 + }, + { + "epoch": 0.5294081645143521, + "grad_norm": 0.34842419624328613, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.5300201970744843, + "grad_norm": 0.38205671310424805, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0065, + "step": 8660 + }, + { + "epoch": 0.5306322296346165, + "grad_norm": 0.35549092292785645, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0068, + "step": 8670 + }, + { + "epoch": 0.5312442621947487, + "grad_norm": 0.15676020085811615, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0055, + "step": 8680 + }, + { + "epoch": 0.5318562947548809, + "grad_norm": 0.22985056042671204, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0071, + "step": 8690 + }, + { + "epoch": 0.5324683273150131, + "grad_norm": 0.2743426263332367, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0047, + "step": 8700 + }, + { + "epoch": 0.5330803598751453, + "grad_norm": 0.2503803074359894, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0079, + "step": 8710 + }, + { + "epoch": 0.5336923924352776, + "grad_norm": 0.5036469101905823, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5343044249954098, + "grad_norm": 0.2349964827299118, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0068, + "step": 8730 + }, + { + "epoch": 0.534916457555542, + "grad_norm": 0.28706061840057373, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0065, + "step": 8740 + }, + { + "epoch": 0.5355284901156742, + "grad_norm": 0.21812452375888824, + "learning_rate": 1.701081551967764e-05, + "loss": 0.008, + "step": 8750 + }, + { + "epoch": 0.5361405226758064, + "grad_norm": 0.301618754863739, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 0.5367525552359386, + "grad_norm": 0.35402950644493103, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0067, + "step": 8770 + }, + { + "epoch": 0.5373645877960708, + "grad_norm": 0.2875203788280487, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0097, + "step": 8780 + }, + { + "epoch": 0.537976620356203, + "grad_norm": 0.2358965128660202, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0053, + "step": 8790 + }, + { + "epoch": 0.5385886529163352, + "grad_norm": 0.14462094008922577, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0059, + "step": 8800 + }, + { + "epoch": 0.5392006854764674, + "grad_norm": 0.17893171310424805, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0062, + "step": 8810 + }, + { + "epoch": 0.5398127180365996, + "grad_norm": 0.2923351526260376, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0078, + "step": 8820 + }, + { + "epoch": 0.5404247505967318, + "grad_norm": 0.3288479745388031, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0067, + "step": 8830 + }, + { + "epoch": 0.541036783156864, + "grad_norm": 0.3996310532093048, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5416488157169962, + "grad_norm": 0.24345380067825317, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0067, + "step": 8850 + }, + { + "epoch": 0.5422608482771284, + "grad_norm": 0.26688340306282043, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0072, + "step": 8860 + }, + { + "epoch": 0.5428728808372606, + "grad_norm": 0.4816153645515442, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0099, + "step": 8870 + }, + { + "epoch": 0.5434849133973927, + "grad_norm": 0.22544988989830017, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.007, + "step": 8880 + }, + { + "epoch": 0.5440969459575249, + "grad_norm": 0.2820419669151306, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0064, + "step": 8890 + }, + { + "epoch": 0.5447089785176571, + "grad_norm": 0.2758846879005432, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0072, + "step": 8900 + }, + { + "epoch": 0.5453210110777893, + "grad_norm": 0.4620129466056824, + "learning_rate": 1.689381359053773e-05, + "loss": 0.008, + "step": 8910 + }, + { + "epoch": 0.5459330436379215, + "grad_norm": 0.5567039847373962, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0079, + "step": 8920 + }, + { + "epoch": 0.5465450761980537, + "grad_norm": 0.347251832485199, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.006, + "step": 8930 + }, + { + "epoch": 0.5471571087581859, + "grad_norm": 0.31768012046813965, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0065, + "step": 8940 + }, + { + "epoch": 0.5477691413183181, + "grad_norm": 0.24245156347751617, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0052, + "step": 8950 + }, + { + "epoch": 0.5483811738784503, + "grad_norm": 0.2124931961297989, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0074, + "step": 8960 + }, + { + "epoch": 0.5489932064385825, + "grad_norm": 0.18998636305332184, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0056, + "step": 8970 + }, + { + "epoch": 0.5496052389987147, + "grad_norm": 0.2667362689971924, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0056, + "step": 8980 + }, + { + "epoch": 0.5502172715588469, + "grad_norm": 0.4424617886543274, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0091, + "step": 8990 + }, + { + "epoch": 0.5508293041189791, + "grad_norm": 0.33623644709587097, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0061, + "step": 9000 + }, + { + "epoch": 0.5514413366791113, + "grad_norm": 0.29990604519844055, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0059, + "step": 9010 + }, + { + "epoch": 0.5520533692392435, + "grad_norm": 0.4384118914604187, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.5526654017993757, + "grad_norm": 0.3468496799468994, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0068, + "step": 9030 + }, + { + "epoch": 0.5532774343595079, + "grad_norm": 0.3473573327064514, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0062, + "step": 9040 + }, + { + "epoch": 0.5538894669196401, + "grad_norm": 0.36125242710113525, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0062, + "step": 9050 + }, + { + "epoch": 0.5545014994797723, + "grad_norm": 0.2603420615196228, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0091, + "step": 9060 + }, + { + "epoch": 0.5551135320399045, + "grad_norm": 0.27355659008026123, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0074, + "step": 9070 + }, + { + "epoch": 0.5557255646000367, + "grad_norm": 0.24741119146347046, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0064, + "step": 9080 + }, + { + "epoch": 0.556337597160169, + "grad_norm": 0.2001475840806961, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0094, + "step": 9090 + }, + { + "epoch": 0.5569496297203012, + "grad_norm": 0.41522347927093506, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0088, + "step": 9100 + }, + { + "epoch": 0.5575616622804334, + "grad_norm": 0.27282488346099854, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0062, + "step": 9110 + }, + { + "epoch": 0.5581736948405656, + "grad_norm": 0.26905956864356995, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.007, + "step": 9120 + }, + { + "epoch": 0.5587857274006978, + "grad_norm": 0.24747484922409058, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0084, + "step": 9130 + }, + { + "epoch": 0.55939775996083, + "grad_norm": 0.1863871067762375, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0061, + "step": 9140 + }, + { + "epoch": 0.5600097925209622, + "grad_norm": 0.3599740266799927, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0063, + "step": 9150 + }, + { + "epoch": 0.5606218250810943, + "grad_norm": 0.2238125205039978, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0047, + "step": 9160 + }, + { + "epoch": 0.5612338576412265, + "grad_norm": 0.272077351808548, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.006, + "step": 9170 + }, + { + "epoch": 0.5618458902013587, + "grad_norm": 0.2371625155210495, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0064, + "step": 9180 + }, + { + "epoch": 0.5624579227614909, + "grad_norm": 0.12783293426036835, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0054, + "step": 9190 + }, + { + "epoch": 0.5630699553216231, + "grad_norm": 0.3144581615924835, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0064, + "step": 9200 + }, + { + "epoch": 0.5636819878817553, + "grad_norm": 0.31995031237602234, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0082, + "step": 9210 + }, + { + "epoch": 0.5642940204418875, + "grad_norm": 0.31995660066604614, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0083, + "step": 9220 + }, + { + "epoch": 0.5649060530020197, + "grad_norm": 0.5018982291221619, + "learning_rate": 1.665453350687773e-05, + "loss": 0.007, + "step": 9230 + }, + { + "epoch": 0.5655180855621519, + "grad_norm": 0.2927841544151306, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0104, + "step": 9240 + }, + { + "epoch": 0.5661301181222841, + "grad_norm": 0.21124979853630066, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0068, + "step": 9250 + }, + { + "epoch": 0.5667421506824163, + "grad_norm": 0.25787463784217834, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.5673541832425485, + "grad_norm": 0.3194720447063446, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0065, + "step": 9270 + }, + { + "epoch": 0.5679662158026807, + "grad_norm": 0.24165599048137665, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.006, + "step": 9280 + }, + { + "epoch": 0.5685782483628129, + "grad_norm": 0.4880482256412506, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0096, + "step": 9290 + }, + { + "epoch": 0.5691902809229451, + "grad_norm": 0.24660199880599976, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0098, + "step": 9300 + }, + { + "epoch": 0.5698023134830773, + "grad_norm": 0.24707400798797607, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0052, + "step": 9310 + }, + { + "epoch": 0.5704143460432095, + "grad_norm": 0.33855682611465454, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.011, + "step": 9320 + }, + { + "epoch": 0.5710263786033417, + "grad_norm": 0.22913751006126404, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0074, + "step": 9330 + }, + { + "epoch": 0.5716384111634739, + "grad_norm": 0.24127185344696045, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0062, + "step": 9340 + }, + { + "epoch": 0.5722504437236061, + "grad_norm": 0.26104915142059326, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0065, + "step": 9350 + }, + { + "epoch": 0.5728624762837383, + "grad_norm": 0.21698857843875885, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0062, + "step": 9360 + }, + { + "epoch": 0.5734745088438705, + "grad_norm": 0.29092445969581604, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0081, + "step": 9370 + }, + { + "epoch": 0.5740865414040027, + "grad_norm": 0.2534378468990326, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0064, + "step": 9380 + }, + { + "epoch": 0.5746985739641349, + "grad_norm": 0.28900131583213806, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.5753106065242671, + "grad_norm": 0.3028101921081543, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0104, + "step": 9400 + }, + { + "epoch": 0.5759226390843993, + "grad_norm": 0.28851139545440674, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 0.5765346716445315, + "grad_norm": 0.5735841393470764, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0072, + "step": 9420 + }, + { + "epoch": 0.5771467042046637, + "grad_norm": 0.20355567336082458, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0056, + "step": 9430 + }, + { + "epoch": 0.5777587367647958, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.006, + "step": 9440 + }, + { + "epoch": 0.578370769324928, + "grad_norm": 0.2701684832572937, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 0.5789828018850602, + "grad_norm": 0.17381855845451355, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0054, + "step": 9460 + }, + { + "epoch": 0.5795948344451924, + "grad_norm": 0.250261515378952, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5802068670053246, + "grad_norm": 0.22972841560840607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0058, + "step": 9480 + }, + { + "epoch": 0.5808188995654568, + "grad_norm": 0.22654809057712555, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0061, + "step": 9490 + }, + { + "epoch": 0.581430932125589, + "grad_norm": 0.17165100574493408, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5820429646857213, + "grad_norm": 0.2462143450975418, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0054, + "step": 9510 + }, + { + "epoch": 0.5826549972458535, + "grad_norm": 0.3970383107662201, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0069, + "step": 9520 + }, + { + "epoch": 0.5832670298059857, + "grad_norm": 0.21578988432884216, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0054, + "step": 9530 + }, + { + "epoch": 0.5838790623661179, + "grad_norm": 0.5680915713310242, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0072, + "step": 9540 + }, + { + "epoch": 0.5844910949262501, + "grad_norm": 0.24070246517658234, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0073, + "step": 9550 + }, + { + "epoch": 0.5851031274863823, + "grad_norm": 0.2524685263633728, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0068, + "step": 9560 + }, + { + "epoch": 0.5857151600465145, + "grad_norm": 0.27286672592163086, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.008, + "step": 9570 + }, + { + "epoch": 0.5863271926066467, + "grad_norm": 0.3459629714488983, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0088, + "step": 9580 + }, + { + "epoch": 0.5869392251667789, + "grad_norm": 0.2964814603328705, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0066, + "step": 9590 + }, + { + "epoch": 0.5875512577269111, + "grad_norm": 0.3559853434562683, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0064, + "step": 9600 + }, + { + "epoch": 0.5881632902870433, + "grad_norm": 0.256898432970047, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0056, + "step": 9610 + }, + { + "epoch": 0.5887753228471755, + "grad_norm": 0.25032711029052734, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0052, + "step": 9620 + }, + { + "epoch": 0.5893873554073077, + "grad_norm": 0.2467224895954132, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0048, + "step": 9630 + }, + { + "epoch": 0.5899993879674399, + "grad_norm": 0.5331161618232727, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0078, + "step": 9640 + }, + { + "epoch": 0.5906114205275721, + "grad_norm": 0.33348897099494934, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0068, + "step": 9650 + }, + { + "epoch": 0.5912234530877043, + "grad_norm": 0.21435993909835815, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0058, + "step": 9660 + }, + { + "epoch": 0.5918354856478365, + "grad_norm": 0.35850396752357483, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0068, + "step": 9670 + }, + { + "epoch": 0.5924475182079687, + "grad_norm": 0.3007623851299286, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0063, + "step": 9680 + }, + { + "epoch": 0.5930595507681009, + "grad_norm": 0.22949714958667755, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0054, + "step": 9690 + }, + { + "epoch": 0.5936715833282331, + "grad_norm": 0.23259367048740387, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.5942836158883653, + "grad_norm": 0.2305079996585846, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0047, + "step": 9710 + }, + { + "epoch": 0.5948956484484974, + "grad_norm": 0.33875930309295654, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0063, + "step": 9720 + }, + { + "epoch": 0.5955076810086296, + "grad_norm": 0.3981896936893463, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0076, + "step": 9730 + }, + { + "epoch": 0.5961197135687618, + "grad_norm": 0.280831515789032, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0075, + "step": 9740 + }, + { + "epoch": 0.596731746128894, + "grad_norm": 0.26045629382133484, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0064, + "step": 9750 + }, + { + "epoch": 0.5973437786890262, + "grad_norm": 0.23102521896362305, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0071, + "step": 9760 + }, + { + "epoch": 0.5979558112491584, + "grad_norm": 0.5013224482536316, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0063, + "step": 9770 + }, + { + "epoch": 0.5985678438092906, + "grad_norm": 0.45689067244529724, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0088, + "step": 9780 + }, + { + "epoch": 0.5991798763694228, + "grad_norm": 0.27118632197380066, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.599791908929555, + "grad_norm": 0.420202374458313, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0076, + "step": 9800 + }, + { + "epoch": 0.6004039414896872, + "grad_norm": 0.35844025015830994, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0056, + "step": 9810 + }, + { + "epoch": 0.6010159740498194, + "grad_norm": 0.2205585241317749, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0082, + "step": 9820 + }, + { + "epoch": 0.6016280066099516, + "grad_norm": 0.18860426545143127, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.007, + "step": 9830 + }, + { + "epoch": 0.6022400391700838, + "grad_norm": 0.25045180320739746, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0082, + "step": 9840 + }, + { + "epoch": 0.602852071730216, + "grad_norm": 0.2581705152988434, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0065, + "step": 9850 + }, + { + "epoch": 0.6034641042903482, + "grad_norm": 0.25894811749458313, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0058, + "step": 9860 + }, + { + "epoch": 0.6040761368504804, + "grad_norm": 0.43305444717407227, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0066, + "step": 9870 + }, + { + "epoch": 0.6046881694106127, + "grad_norm": 0.2295757383108139, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0069, + "step": 9880 + }, + { + "epoch": 0.6053002019707449, + "grad_norm": 0.29785802960395813, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0074, + "step": 9890 + }, + { + "epoch": 0.6059122345308771, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0068, + "step": 9900 + }, + { + "epoch": 0.6065242670910093, + "grad_norm": 0.29115045070648193, + "learning_rate": 1.612387195896372e-05, + "loss": 0.008, + "step": 9910 + }, + { + "epoch": 0.6071362996511415, + "grad_norm": 0.3202555477619171, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0071, + "step": 9920 + }, + { + "epoch": 0.6077483322112737, + "grad_norm": 0.2849314212799072, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.005, + "step": 9930 + }, + { + "epoch": 0.6083603647714059, + "grad_norm": 0.2768756151199341, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0051, + "step": 9940 + }, + { + "epoch": 0.6089723973315381, + "grad_norm": 0.3138035535812378, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0058, + "step": 9950 + }, + { + "epoch": 0.6095844298916703, + "grad_norm": 0.20827682316303253, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0058, + "step": 9960 + }, + { + "epoch": 0.6101964624518025, + "grad_norm": 0.29986995458602905, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0076, + "step": 9970 + }, + { + "epoch": 0.6108084950119347, + "grad_norm": 0.23564326763153076, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0056, + "step": 9980 + }, + { + "epoch": 0.6114205275720669, + "grad_norm": 0.24854765832424164, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0066, + "step": 9990 + }, + { + "epoch": 0.6120325601321991, + "grad_norm": 0.5696694850921631, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0072, + "step": 10000 + }, + { + "epoch": 0.6126445926923312, + "grad_norm": 0.24267911911010742, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0061, + "step": 10010 + }, + { + "epoch": 0.6132566252524634, + "grad_norm": 0.1955283135175705, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0076, + "step": 10020 + }, + { + "epoch": 0.6138686578125956, + "grad_norm": 0.3427830934524536, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0073, + "step": 10030 + }, + { + "epoch": 0.6144806903727278, + "grad_norm": 0.38532915711402893, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0078, + "step": 10040 + }, + { + "epoch": 0.61509272293286, + "grad_norm": 0.4302294850349426, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0071, + "step": 10050 + }, + { + "epoch": 0.6157047554929922, + "grad_norm": 0.38420233130455017, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0072, + "step": 10060 + }, + { + "epoch": 0.6163167880531244, + "grad_norm": 0.23822636902332306, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.004, + "step": 10070 + }, + { + "epoch": 0.6169288206132566, + "grad_norm": 0.25123289227485657, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0065, + "step": 10080 + }, + { + "epoch": 0.6175408531733888, + "grad_norm": 0.23007746040821075, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0059, + "step": 10090 + }, + { + "epoch": 0.618152885733521, + "grad_norm": 0.24051082134246826, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0089, + "step": 10100 + }, + { + "epoch": 0.6187649182936532, + "grad_norm": 0.26246321201324463, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0052, + "step": 10110 + }, + { + "epoch": 0.6193769508537854, + "grad_norm": 0.3160432279109955, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0059, + "step": 10120 + }, + { + "epoch": 0.6199889834139176, + "grad_norm": 0.42534199357032776, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0071, + "step": 10130 + }, + { + "epoch": 0.6206010159740498, + "grad_norm": 0.22966268658638, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0052, + "step": 10140 + }, + { + "epoch": 0.621213048534182, + "grad_norm": 0.22234882414340973, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0073, + "step": 10150 + }, + { + "epoch": 0.6218250810943142, + "grad_norm": 0.31061676144599915, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0066, + "step": 10160 + }, + { + "epoch": 0.6224371136544464, + "grad_norm": 0.34178492426872253, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.6230491462145786, + "grad_norm": 0.263583779335022, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0079, + "step": 10180 + }, + { + "epoch": 0.6236611787747108, + "grad_norm": 0.3774336278438568, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0066, + "step": 10190 + }, + { + "epoch": 0.624273211334843, + "grad_norm": 0.29274430871009827, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.007, + "step": 10200 + }, + { + "epoch": 0.6248852438949752, + "grad_norm": 0.31850868463516235, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0056, + "step": 10210 + }, + { + "epoch": 0.6254972764551074, + "grad_norm": 0.3084369897842407, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0086, + "step": 10220 + }, + { + "epoch": 0.6261093090152396, + "grad_norm": 0.21596118807792664, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0072, + "step": 10230 + }, + { + "epoch": 0.6267213415753718, + "grad_norm": 0.16397996246814728, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0075, + "step": 10240 + }, + { + "epoch": 0.627333374135504, + "grad_norm": 0.15055827796459198, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0046, + "step": 10250 + }, + { + "epoch": 0.6279454066956363, + "grad_norm": 0.23483684659004211, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0064, + "step": 10260 + }, + { + "epoch": 0.6285574392557685, + "grad_norm": 0.3131091594696045, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0069, + "step": 10270 + }, + { + "epoch": 0.6291694718159007, + "grad_norm": 0.27958226203918457, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0067, + "step": 10280 + }, + { + "epoch": 0.6297815043760328, + "grad_norm": 0.23422567546367645, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0086, + "step": 10290 + }, + { + "epoch": 0.630393536936165, + "grad_norm": 0.4644703269004822, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0067, + "step": 10300 + }, + { + "epoch": 0.6310055694962972, + "grad_norm": 0.45787107944488525, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0068, + "step": 10310 + }, + { + "epoch": 0.6316176020564294, + "grad_norm": 0.21038737893104553, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0074, + "step": 10320 + }, + { + "epoch": 0.6322296346165616, + "grad_norm": 0.23812010884284973, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0054, + "step": 10330 + }, + { + "epoch": 0.6328416671766938, + "grad_norm": 0.36856284737586975, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0061, + "step": 10340 + }, + { + "epoch": 0.633453699736826, + "grad_norm": 0.3540131151676178, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0074, + "step": 10350 + }, + { + "epoch": 0.6340657322969582, + "grad_norm": 0.3004823923110962, + "learning_rate": 1.575723252169281e-05, + "loss": 0.006, + "step": 10360 + }, + { + "epoch": 0.6346777648570904, + "grad_norm": 0.17188489437103271, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0053, + "step": 10370 + }, + { + "epoch": 0.6352897974172226, + "grad_norm": 0.21710847318172455, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0062, + "step": 10380 + }, + { + "epoch": 0.6359018299773548, + "grad_norm": 0.2356785386800766, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0061, + "step": 10390 + }, + { + "epoch": 0.636513862537487, + "grad_norm": 0.2736414670944214, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.6371258950976192, + "grad_norm": 0.23872444033622742, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.006, + "step": 10410 + }, + { + "epoch": 0.6377379276577514, + "grad_norm": 0.24478361010551453, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0073, + "step": 10420 + }, + { + "epoch": 0.6383499602178836, + "grad_norm": 0.2964334487915039, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0048, + "step": 10430 + }, + { + "epoch": 0.6389619927780158, + "grad_norm": 0.2760549783706665, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0051, + "step": 10440 + }, + { + "epoch": 0.639574025338148, + "grad_norm": 0.2598065137863159, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0072, + "step": 10450 + }, + { + "epoch": 0.6401860578982802, + "grad_norm": 0.346999853849411, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0052, + "step": 10460 + }, + { + "epoch": 0.6407980904584124, + "grad_norm": 0.31291016936302185, + "learning_rate": 1.56658563993822e-05, + "loss": 0.007, + "step": 10470 + }, + { + "epoch": 0.6414101230185446, + "grad_norm": 0.2631952166557312, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6420221555786768, + "grad_norm": 0.30895209312438965, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.006, + "step": 10490 + }, + { + "epoch": 0.642634188138809, + "grad_norm": 0.17614217102527618, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0069, + "step": 10500 + }, + { + "epoch": 0.6432462206989412, + "grad_norm": 0.38792312145233154, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0077, + "step": 10510 + }, + { + "epoch": 0.6438582532590734, + "grad_norm": 0.1722564697265625, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0054, + "step": 10520 + }, + { + "epoch": 0.6444702858192056, + "grad_norm": 0.2741699516773224, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0062, + "step": 10530 + }, + { + "epoch": 0.6450823183793378, + "grad_norm": 0.2059863954782486, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0073, + "step": 10540 + }, + { + "epoch": 0.64569435093947, + "grad_norm": 0.2702447474002838, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.6463063834996022, + "grad_norm": 0.2299312800168991, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0051, + "step": 10560 + }, + { + "epoch": 0.6469184160597343, + "grad_norm": 0.1995723992586136, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0057, + "step": 10570 + }, + { + "epoch": 0.6475304486198665, + "grad_norm": 0.30346980690956116, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0063, + "step": 10580 + }, + { + "epoch": 0.6481424811799987, + "grad_norm": 0.5040738582611084, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0064, + "step": 10590 + }, + { + "epoch": 0.6487545137401309, + "grad_norm": 0.16984818875789642, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0071, + "step": 10600 + }, + { + "epoch": 0.6493665463002631, + "grad_norm": 0.26560020446777344, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 0.6499785788603953, + "grad_norm": 0.4563823342323303, + "learning_rate": 1.554018740860716e-05, + "loss": 0.008, + "step": 10620 + }, + { + "epoch": 0.6505906114205275, + "grad_norm": 0.23272818326950073, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.006, + "step": 10630 + }, + { + "epoch": 0.6512026439806597, + "grad_norm": 0.19166870415210724, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0066, + "step": 10640 + }, + { + "epoch": 0.651814676540792, + "grad_norm": 0.2822705805301666, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0062, + "step": 10650 + }, + { + "epoch": 0.6524267091009242, + "grad_norm": 0.24001267552375793, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 0.6530387416610564, + "grad_norm": 0.2563900947570801, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0068, + "step": 10670 + }, + { + "epoch": 0.6536507742211886, + "grad_norm": 0.2747437357902527, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0058, + "step": 10680 + }, + { + "epoch": 0.6542628067813208, + "grad_norm": 0.39710354804992676, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.005, + "step": 10690 + }, + { + "epoch": 0.654874839341453, + "grad_norm": 0.30690231919288635, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0102, + "step": 10700 + }, + { + "epoch": 0.6554868719015852, + "grad_norm": 0.2879253923892975, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0072, + "step": 10710 + }, + { + "epoch": 0.6560989044617174, + "grad_norm": 0.19964110851287842, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.6567109370218496, + "grad_norm": 0.20109151303768158, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6573229695819818, + "grad_norm": 0.21469832956790924, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0048, + "step": 10740 + }, + { + "epoch": 0.657935002142114, + "grad_norm": 0.19622936844825745, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0057, + "step": 10750 + }, + { + "epoch": 0.6585470347022462, + "grad_norm": 0.2255190759897232, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0052, + "step": 10760 + }, + { + "epoch": 0.6591590672623784, + "grad_norm": 0.47484955191612244, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0066, + "step": 10770 + }, + { + "epoch": 0.6597710998225106, + "grad_norm": 0.32192179560661316, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0067, + "step": 10780 + }, + { + "epoch": 0.6603831323826428, + "grad_norm": 0.33044904470443726, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0061, + "step": 10790 + }, + { + "epoch": 0.660995164942775, + "grad_norm": 0.3206661343574524, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0072, + "step": 10800 + }, + { + "epoch": 0.6616071975029072, + "grad_norm": 0.34903818368911743, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0055, + "step": 10810 + }, + { + "epoch": 0.6622192300630394, + "grad_norm": 0.1982222944498062, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 0.6628312626231716, + "grad_norm": 0.25388309359550476, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "epoch": 0.6634432951833038, + "grad_norm": 0.2325269728899002, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0078, + "step": 10840 + }, + { + "epoch": 0.6640553277434359, + "grad_norm": 0.3364964425563812, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0054, + "step": 10850 + }, + { + "epoch": 0.6646673603035681, + "grad_norm": 0.198661208152771, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0061, + "step": 10860 + }, + { + "epoch": 0.6652793928637003, + "grad_norm": 0.333836168050766, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0058, + "step": 10870 + }, + { + "epoch": 0.6658914254238325, + "grad_norm": 0.21908101439476013, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0087, + "step": 10880 + }, + { + "epoch": 0.6665034579839647, + "grad_norm": 0.3094167709350586, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0062, + "step": 10890 + }, + { + "epoch": 0.6671154905440969, + "grad_norm": 0.28113746643066406, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0066, + "step": 10900 + }, + { + "epoch": 0.6677275231042291, + "grad_norm": 0.20239399373531342, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0071, + "step": 10910 + }, + { + "epoch": 0.6683395556643613, + "grad_norm": 0.32829156517982483, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0056, + "step": 10920 + }, + { + "epoch": 0.6689515882244935, + "grad_norm": 0.2950859069824219, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 0.6695636207846257, + "grad_norm": 0.36404141783714294, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0075, + "step": 10940 + }, + { + "epoch": 0.6701756533447579, + "grad_norm": 0.2479381114244461, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0055, + "step": 10950 + }, + { + "epoch": 0.6707876859048901, + "grad_norm": 0.1934390366077423, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.005, + "step": 10960 + }, + { + "epoch": 0.6713997184650223, + "grad_norm": 0.20912423729896545, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0064, + "step": 10970 + }, + { + "epoch": 0.6720117510251545, + "grad_norm": 0.1781405806541443, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0048, + "step": 10980 + }, + { + "epoch": 0.6726237835852867, + "grad_norm": 0.18812811374664307, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0048, + "step": 10990 + }, + { + "epoch": 0.6732358161454189, + "grad_norm": 0.2006077766418457, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0073, + "step": 11000 + }, + { + "epoch": 0.6738478487055511, + "grad_norm": 0.20471568405628204, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0068, + "step": 11010 + }, + { + "epoch": 0.6744598812656833, + "grad_norm": 0.2979716658592224, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0067, + "step": 11020 + }, + { + "epoch": 0.6750719138258156, + "grad_norm": 0.3256290853023529, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0054, + "step": 11030 + }, + { + "epoch": 0.6756839463859478, + "grad_norm": 0.3346560001373291, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0061, + "step": 11040 + }, + { + "epoch": 0.67629597894608, + "grad_norm": 0.35791122913360596, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0054, + "step": 11050 + }, + { + "epoch": 0.6769080115062122, + "grad_norm": 0.30428826808929443, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0066, + "step": 11060 + }, + { + "epoch": 0.6775200440663444, + "grad_norm": 0.31254154443740845, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0065, + "step": 11070 + }, + { + "epoch": 0.6781320766264766, + "grad_norm": 0.263028621673584, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0062, + "step": 11080 + }, + { + "epoch": 0.6787441091866088, + "grad_norm": 0.22496990859508514, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.679356141746741, + "grad_norm": 0.2647632360458374, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0072, + "step": 11100 + }, + { + "epoch": 0.6799681743068732, + "grad_norm": 0.2517150342464447, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0064, + "step": 11110 + }, + { + "epoch": 0.6805802068670054, + "grad_norm": 0.30550616979599, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0055, + "step": 11120 + }, + { + "epoch": 0.6811922394271375, + "grad_norm": 0.21312931180000305, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0074, + "step": 11130 + }, + { + "epoch": 0.6818042719872697, + "grad_norm": 0.21152199804782867, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0047, + "step": 11140 + }, + { + "epoch": 0.6824163045474019, + "grad_norm": 0.2030613273382187, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0045, + "step": 11150 + }, + { + "epoch": 0.6830283371075341, + "grad_norm": 0.30646151304244995, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0045, + "step": 11160 + }, + { + "epoch": 0.6836403696676663, + "grad_norm": 0.2693783938884735, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0061, + "step": 11170 + }, + { + "epoch": 0.6842524022277985, + "grad_norm": 0.25288495421409607, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0068, + "step": 11180 + }, + { + "epoch": 0.6848644347879307, + "grad_norm": 0.34989964962005615, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.007, + "step": 11190 + }, + { + "epoch": 0.6854764673480629, + "grad_norm": 0.192350834608078, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0064, + "step": 11200 + }, + { + "epoch": 0.6860884999081951, + "grad_norm": 0.3841196894645691, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.6867005324683273, + "grad_norm": 0.2168666571378708, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0063, + "step": 11220 + }, + { + "epoch": 0.6873125650284595, + "grad_norm": 0.2756234109401703, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0068, + "step": 11230 + }, + { + "epoch": 0.6879245975885917, + "grad_norm": 0.1971903294324875, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.006, + "step": 11240 + }, + { + "epoch": 0.6885366301487239, + "grad_norm": 0.3857499659061432, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0063, + "step": 11250 + }, + { + "epoch": 0.6891486627088561, + "grad_norm": 0.194110706448555, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0049, + "step": 11260 + }, + { + "epoch": 0.6897606952689883, + "grad_norm": 0.24935179948806763, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0054, + "step": 11270 + }, + { + "epoch": 0.6903727278291205, + "grad_norm": 0.5208527445793152, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0062, + "step": 11280 + }, + { + "epoch": 0.6909847603892527, + "grad_norm": 0.2917899191379547, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0058, + "step": 11290 + }, + { + "epoch": 0.6915967929493849, + "grad_norm": 0.42692577838897705, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0072, + "step": 11300 + }, + { + "epoch": 0.6922088255095171, + "grad_norm": 0.36888429522514343, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0059, + "step": 11310 + }, + { + "epoch": 0.6928208580696493, + "grad_norm": 0.26246029138565063, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0057, + "step": 11320 + }, + { + "epoch": 0.6934328906297815, + "grad_norm": 0.22163739800453186, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0078, + "step": 11330 + }, + { + "epoch": 0.6940449231899137, + "grad_norm": 0.33411458134651184, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0053, + "step": 11340 + }, + { + "epoch": 0.6946569557500459, + "grad_norm": 0.2792898118495941, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0095, + "step": 11350 + }, + { + "epoch": 0.6952689883101781, + "grad_norm": 0.2770175039768219, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0059, + "step": 11360 + }, + { + "epoch": 0.6958810208703103, + "grad_norm": 0.14913171529769897, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0046, + "step": 11370 + }, + { + "epoch": 0.6964930534304425, + "grad_norm": 0.22906239330768585, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0054, + "step": 11380 + }, + { + "epoch": 0.6971050859905747, + "grad_norm": 0.2854336202144623, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0052, + "step": 11390 + }, + { + "epoch": 0.697717118550707, + "grad_norm": 0.21835818886756897, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0064, + "step": 11400 + }, + { + "epoch": 0.698329151110839, + "grad_norm": 0.42180293798446655, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0071, + "step": 11410 + }, + { + "epoch": 0.6989411836709712, + "grad_norm": 0.3056841492652893, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0092, + "step": 11420 + }, + { + "epoch": 0.6995532162311034, + "grad_norm": 0.15149559080600739, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0049, + "step": 11430 + }, + { + "epoch": 0.7001652487912357, + "grad_norm": 0.15561188757419586, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0051, + "step": 11440 + }, + { + "epoch": 0.7007772813513679, + "grad_norm": 0.2941122055053711, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0065, + "step": 11450 + }, + { + "epoch": 0.7013893139115001, + "grad_norm": 0.3008195757865906, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0059, + "step": 11460 + }, + { + "epoch": 0.7020013464716323, + "grad_norm": 0.3787235617637634, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0068, + "step": 11470 + }, + { + "epoch": 0.7026133790317645, + "grad_norm": 0.2069675624370575, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.005, + "step": 11480 + }, + { + "epoch": 0.7032254115918967, + "grad_norm": 0.33505553007125854, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0058, + "step": 11490 + }, + { + "epoch": 0.7038374441520289, + "grad_norm": 0.281213641166687, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0064, + "step": 11500 + }, + { + "epoch": 0.7044494767121611, + "grad_norm": 0.28471192717552185, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0066, + "step": 11510 + }, + { + "epoch": 0.7050615092722933, + "grad_norm": 0.3166801929473877, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0062, + "step": 11520 + }, + { + "epoch": 0.7056735418324255, + "grad_norm": 0.26893407106399536, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.005, + "step": 11530 + }, + { + "epoch": 0.7062855743925577, + "grad_norm": 0.17421478033065796, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0063, + "step": 11540 + }, + { + "epoch": 0.7068976069526899, + "grad_norm": 0.40999990701675415, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 0.7075096395128221, + "grad_norm": 0.190180242061615, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0061, + "step": 11560 + }, + { + "epoch": 0.7081216720729543, + "grad_norm": 0.20383603870868683, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0049, + "step": 11570 + }, + { + "epoch": 0.7087337046330865, + "grad_norm": 0.28741395473480225, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.7093457371932187, + "grad_norm": 0.24231962859630585, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.008, + "step": 11590 + }, + { + "epoch": 0.7099577697533509, + "grad_norm": 0.2221115529537201, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0053, + "step": 11600 + }, + { + "epoch": 0.7105698023134831, + "grad_norm": 0.18564820289611816, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0072, + "step": 11610 + }, + { + "epoch": 0.7111818348736153, + "grad_norm": 0.3734343647956848, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0089, + "step": 11620 + }, + { + "epoch": 0.7117938674337475, + "grad_norm": 0.3215912878513336, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0093, + "step": 11630 + }, + { + "epoch": 0.7124058999938797, + "grad_norm": 0.22602899372577667, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0062, + "step": 11640 + }, + { + "epoch": 0.7130179325540119, + "grad_norm": 0.3115978538990021, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.006, + "step": 11650 + }, + { + "epoch": 0.7136299651141441, + "grad_norm": 0.26148155331611633, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0071, + "step": 11660 + }, + { + "epoch": 0.7142419976742763, + "grad_norm": 0.142781600356102, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0051, + "step": 11670 + }, + { + "epoch": 0.7148540302344085, + "grad_norm": 0.21306048333644867, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0068, + "step": 11680 + }, + { + "epoch": 0.7154660627945407, + "grad_norm": 0.3439876437187195, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7160780953546728, + "grad_norm": 0.4010280966758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0062, + "step": 11700 + }, + { + "epoch": 0.716690127914805, + "grad_norm": 0.2760031819343567, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 0.7173021604749372, + "grad_norm": 0.45097261667251587, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0062, + "step": 11720 + }, + { + "epoch": 0.7179141930350694, + "grad_norm": 0.20118115842342377, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0054, + "step": 11730 + }, + { + "epoch": 0.7185262255952016, + "grad_norm": 0.3090760409832001, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0054, + "step": 11740 + }, + { + "epoch": 0.7191382581553338, + "grad_norm": 0.25016647577285767, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0077, + "step": 11750 + }, + { + "epoch": 0.719750290715466, + "grad_norm": 0.2310703545808792, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0053, + "step": 11760 + }, + { + "epoch": 0.7203623232755982, + "grad_norm": 0.2269359678030014, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.006, + "step": 11770 + }, + { + "epoch": 0.7209743558357304, + "grad_norm": 0.3917788565158844, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7215863883958626, + "grad_norm": 0.25999465584754944, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0064, + "step": 11790 + }, + { + "epoch": 0.7221984209559948, + "grad_norm": 0.19340357184410095, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0071, + "step": 11800 + }, + { + "epoch": 0.722810453516127, + "grad_norm": 0.25046268105506897, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0078, + "step": 11810 + }, + { + "epoch": 0.7234224860762593, + "grad_norm": 0.19819264113903046, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.005, + "step": 11820 + }, + { + "epoch": 0.7240345186363915, + "grad_norm": 0.43484950065612793, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0078, + "step": 11830 + }, + { + "epoch": 0.7246465511965237, + "grad_norm": 0.29191601276397705, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7252585837566559, + "grad_norm": 0.21717441082000732, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0056, + "step": 11850 + }, + { + "epoch": 0.7258706163167881, + "grad_norm": 0.3210129737854004, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0072, + "step": 11860 + }, + { + "epoch": 0.7264826488769203, + "grad_norm": 0.33192649483680725, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0061, + "step": 11870 + }, + { + "epoch": 0.7270946814370525, + "grad_norm": 0.14648163318634033, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7277067139971847, + "grad_norm": 0.20028764009475708, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0052, + "step": 11890 + }, + { + "epoch": 0.7283187465573169, + "grad_norm": 0.21449612081050873, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0063, + "step": 11900 + }, + { + "epoch": 0.7289307791174491, + "grad_norm": 0.27472081780433655, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0062, + "step": 11910 + }, + { + "epoch": 0.7295428116775813, + "grad_norm": 0.2919130027294159, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0048, + "step": 11920 + }, + { + "epoch": 0.7301548442377135, + "grad_norm": 0.153092160820961, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0053, + "step": 11930 + }, + { + "epoch": 0.7307668767978457, + "grad_norm": 0.22820086777210236, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0058, + "step": 11940 + }, + { + "epoch": 0.7313789093579779, + "grad_norm": 0.24281881749629974, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0044, + "step": 11950 + }, + { + "epoch": 0.7319909419181101, + "grad_norm": 0.32581812143325806, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0051, + "step": 11960 + }, + { + "epoch": 0.7326029744782423, + "grad_norm": 0.3139822483062744, + "learning_rate": 1.435930222050582e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 0.7332150070383744, + "grad_norm": 0.37985655665397644, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0052, + "step": 11980 + }, + { + "epoch": 0.7338270395985066, + "grad_norm": 0.1958508938550949, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.007, + "step": 11990 + }, + { + "epoch": 0.7344390721586388, + "grad_norm": 0.25318172574043274, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0084, + "step": 12000 + }, + { + "epoch": 0.735051104718771, + "grad_norm": 0.33245304226875305, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0051, + "step": 12010 + }, + { + "epoch": 0.7356631372789032, + "grad_norm": 0.2750372290611267, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0057, + "step": 12020 + }, + { + "epoch": 0.7362751698390354, + "grad_norm": 0.2057010382413864, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0057, + "step": 12030 + }, + { + "epoch": 0.7368872023991676, + "grad_norm": 0.30713731050491333, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0067, + "step": 12040 + }, + { + "epoch": 0.7374992349592998, + "grad_norm": 0.20423808693885803, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.006, + "step": 12050 + }, + { + "epoch": 0.738111267519432, + "grad_norm": 0.3129539489746094, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0067, + "step": 12060 + }, + { + "epoch": 0.7387233000795642, + "grad_norm": 0.25026270747184753, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7393353326396964, + "grad_norm": 0.4147534668445587, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0061, + "step": 12080 + }, + { + "epoch": 0.7399473651998286, + "grad_norm": 0.20954278111457825, + "learning_rate": 1.425047976058418e-05, + "loss": 0.006, + "step": 12090 + }, + { + "epoch": 0.7405593977599608, + "grad_norm": 0.2700798809528351, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 0.741171430320093, + "grad_norm": 0.2597086429595947, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0059, + "step": 12110 + }, + { + "epoch": 0.7417834628802252, + "grad_norm": 0.2674495279788971, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0045, + "step": 12120 + }, + { + "epoch": 0.7423954954403574, + "grad_norm": 0.24583879113197327, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0061, + "step": 12130 + }, + { + "epoch": 0.7430075280004896, + "grad_norm": 0.23704801499843597, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0054, + "step": 12140 + }, + { + "epoch": 0.7436195605606218, + "grad_norm": 0.2381024807691574, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0073, + "step": 12150 + }, + { + "epoch": 0.744231593120754, + "grad_norm": 0.24937355518341064, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0049, + "step": 12160 + }, + { + "epoch": 0.7448436256808862, + "grad_norm": 0.20442882180213928, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0061, + "step": 12170 + }, + { + "epoch": 0.7454556582410184, + "grad_norm": 0.3053426742553711, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0087, + "step": 12180 + }, + { + "epoch": 0.7460676908011507, + "grad_norm": 0.3654315769672394, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0047, + "step": 12190 + }, + { + "epoch": 0.7466797233612829, + "grad_norm": 0.18926535546779633, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7472917559214151, + "grad_norm": 0.21620485186576843, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0094, + "step": 12210 + }, + { + "epoch": 0.7479037884815473, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0059, + "step": 12220 + }, + { + "epoch": 0.7485158210416795, + "grad_norm": 0.39795419573783875, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 0.7491278536018117, + "grad_norm": 0.20502857863903046, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0048, + "step": 12240 + }, + { + "epoch": 0.7497398861619439, + "grad_norm": 0.23821429908275604, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0082, + "step": 12250 + }, + { + "epoch": 0.750351918722076, + "grad_norm": 0.45541366934776306, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0071, + "step": 12260 + }, + { + "epoch": 0.7509639512822082, + "grad_norm": 0.24881400167942047, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.7515759838423404, + "grad_norm": 0.2409125715494156, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0061, + "step": 12280 + }, + { + "epoch": 0.7521880164024726, + "grad_norm": 0.2930417060852051, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0054, + "step": 12290 + }, + { + "epoch": 0.7528000489626048, + "grad_norm": 0.30566394329071045, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0054, + "step": 12300 + }, + { + "epoch": 0.753412081522737, + "grad_norm": 0.32679763436317444, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0059, + "step": 12310 + }, + { + "epoch": 0.7540241140828692, + "grad_norm": 0.29273876547813416, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0067, + "step": 12320 + }, + { + "epoch": 0.7546361466430014, + "grad_norm": 0.19642773270606995, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0062, + "step": 12330 + }, + { + "epoch": 0.7552481792031336, + "grad_norm": 0.21928250789642334, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0066, + "step": 12340 + }, + { + "epoch": 0.7558602117632658, + "grad_norm": 0.2534322738647461, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.756472244323398, + "grad_norm": 0.20712649822235107, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 0.7570842768835302, + "grad_norm": 0.18670639395713806, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0063, + "step": 12370 + }, + { + "epoch": 0.7576963094436624, + "grad_norm": 0.26770254969596863, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0052, + "step": 12380 + }, + { + "epoch": 0.7583083420037946, + "grad_norm": 0.3621291518211365, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0056, + "step": 12390 + }, + { + "epoch": 0.7589203745639268, + "grad_norm": 0.31771939992904663, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0059, + "step": 12400 + }, + { + "epoch": 0.759532407124059, + "grad_norm": 0.44418177008628845, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0065, + "step": 12410 + }, + { + "epoch": 0.7601444396841912, + "grad_norm": 0.2183474898338318, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0046, + "step": 12420 + }, + { + "epoch": 0.7607564722443234, + "grad_norm": 0.4400590658187866, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0061, + "step": 12430 + }, + { + "epoch": 0.7613685048044556, + "grad_norm": 0.296539843082428, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0059, + "step": 12440 + }, + { + "epoch": 0.7619805373645878, + "grad_norm": 0.352870374917984, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0055, + "step": 12450 + }, + { + "epoch": 0.76259256992472, + "grad_norm": 0.19494596123695374, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.7632046024848522, + "grad_norm": 0.3799489438533783, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0057, + "step": 12470 + }, + { + "epoch": 0.7638166350449844, + "grad_norm": 0.3572365641593933, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0053, + "step": 12480 + }, + { + "epoch": 0.7644286676051166, + "grad_norm": 0.2559097707271576, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0062, + "step": 12490 + }, + { + "epoch": 0.7650407001652488, + "grad_norm": 0.13144978880882263, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0065, + "step": 12500 + }, + { + "epoch": 0.765652732725381, + "grad_norm": 0.34635287523269653, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7662647652855132, + "grad_norm": 0.25615188479423523, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0057, + "step": 12520 + }, + { + "epoch": 0.7668767978456454, + "grad_norm": 0.17619644105434418, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0047, + "step": 12530 + }, + { + "epoch": 0.7674888304057775, + "grad_norm": 0.20169994235038757, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0068, + "step": 12540 + }, + { + "epoch": 0.7681008629659097, + "grad_norm": 0.49686071276664734, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0066, + "step": 12550 + }, + { + "epoch": 0.7687128955260419, + "grad_norm": 0.28179335594177246, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0058, + "step": 12560 + }, + { + "epoch": 0.7693249280861741, + "grad_norm": 0.28156182169914246, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.005, + "step": 12570 + }, + { + "epoch": 0.7699369606463063, + "grad_norm": 0.15054315328598022, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0051, + "step": 12580 + }, + { + "epoch": 0.7705489932064385, + "grad_norm": 0.22872644662857056, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0066, + "step": 12590 + }, + { + "epoch": 0.7711610257665708, + "grad_norm": 0.25821951031684875, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0054, + "step": 12600 + }, + { + "epoch": 0.771773058326703, + "grad_norm": 0.23592771589756012, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0059, + "step": 12610 + }, + { + "epoch": 0.7723850908868352, + "grad_norm": 0.34409141540527344, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0053, + "step": 12620 + }, + { + "epoch": 0.7729971234469674, + "grad_norm": 0.2803158760070801, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0042, + "step": 12630 + }, + { + "epoch": 0.7736091560070996, + "grad_norm": 0.32796284556388855, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0074, + "step": 12640 + }, + { + "epoch": 0.7742211885672318, + "grad_norm": 0.34749120473861694, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0054, + "step": 12650 + }, + { + "epoch": 0.774833221127364, + "grad_norm": 0.34066343307495117, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0082, + "step": 12660 + }, + { + "epoch": 0.7754452536874962, + "grad_norm": 0.4294384717941284, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0062, + "step": 12670 + }, + { + "epoch": 0.7760572862476284, + "grad_norm": 0.2355230748653412, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0069, + "step": 12680 + }, + { + "epoch": 0.7766693188077606, + "grad_norm": 0.3181976079940796, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 0.7772813513678928, + "grad_norm": 0.2763727605342865, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0052, + "step": 12700 + }, + { + "epoch": 0.777893383928025, + "grad_norm": 0.2938949465751648, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0041, + "step": 12710 + }, + { + "epoch": 0.7785054164881572, + "grad_norm": 0.31331220269203186, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0062, + "step": 12720 + }, + { + "epoch": 0.7791174490482894, + "grad_norm": 0.3389904797077179, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0061, + "step": 12730 + }, + { + "epoch": 0.7797294816084216, + "grad_norm": 0.2848975360393524, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0065, + "step": 12740 + }, + { + "epoch": 0.7803415141685538, + "grad_norm": 0.29838478565216064, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0061, + "step": 12750 + }, + { + "epoch": 0.780953546728686, + "grad_norm": 0.47004032135009766, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0066, + "step": 12760 + }, + { + "epoch": 0.7815655792888182, + "grad_norm": 0.26898056268692017, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0063, + "step": 12770 + }, + { + "epoch": 0.7821776118489504, + "grad_norm": 0.29459917545318604, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0065, + "step": 12780 + }, + { + "epoch": 0.7827896444090826, + "grad_norm": 0.3481508791446686, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0058, + "step": 12790 + }, + { + "epoch": 0.7834016769692148, + "grad_norm": 0.1707627922296524, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0053, + "step": 12800 + }, + { + "epoch": 0.784013709529347, + "grad_norm": 0.14735333621501923, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0058, + "step": 12810 + }, + { + "epoch": 0.7846257420894791, + "grad_norm": 0.28002044558525085, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.006, + "step": 12820 + }, + { + "epoch": 0.7852377746496113, + "grad_norm": 0.39598894119262695, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0062, + "step": 12830 + }, + { + "epoch": 0.7858498072097435, + "grad_norm": 0.19379247725009918, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0058, + "step": 12840 + }, + { + "epoch": 0.7864618397698757, + "grad_norm": 0.27260729670524597, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7870738723300079, + "grad_norm": 0.2845087945461273, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0052, + "step": 12860 + }, + { + "epoch": 0.7876859048901401, + "grad_norm": 0.37151217460632324, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0043, + "step": 12870 + }, + { + "epoch": 0.7882979374502723, + "grad_norm": 0.3387412130832672, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0046, + "step": 12880 + }, + { + "epoch": 0.7889099700104045, + "grad_norm": 0.42672809958457947, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7895220025705367, + "grad_norm": 0.20378202199935913, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.7901340351306689, + "grad_norm": 0.16417330503463745, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0045, + "step": 12910 + }, + { + "epoch": 0.7907460676908011, + "grad_norm": 0.1704142540693283, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0054, + "step": 12920 + }, + { + "epoch": 0.7913581002509333, + "grad_norm": 0.21494890749454498, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.7919701328110655, + "grad_norm": 0.3430638909339905, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0046, + "step": 12940 + }, + { + "epoch": 0.7925821653711977, + "grad_norm": 0.22641201317310333, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0049, + "step": 12950 + }, + { + "epoch": 0.79319419793133, + "grad_norm": 0.27153971791267395, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 0.7938062304914622, + "grad_norm": 0.2648560702800751, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 0.7944182630515944, + "grad_norm": 0.2148633897304535, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0048, + "step": 12980 + }, + { + "epoch": 0.7950302956117266, + "grad_norm": 0.35170191526412964, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0069, + "step": 12990 + }, + { + "epoch": 0.7956423281718588, + "grad_norm": 0.3539712429046631, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0067, + "step": 13000 + }, + { + "epoch": 0.796254360731991, + "grad_norm": 0.29938259720802307, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0102, + "step": 13010 + }, + { + "epoch": 0.7968663932921232, + "grad_norm": 0.35241010785102844, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7974784258522554, + "grad_norm": 0.2929113805294037, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0061, + "step": 13030 + }, + { + "epoch": 0.7980904584123876, + "grad_norm": 0.24052929878234863, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0046, + "step": 13040 + }, + { + "epoch": 0.7987024909725198, + "grad_norm": 0.21611042320728302, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0043, + "step": 13050 + }, + { + "epoch": 0.799314523532652, + "grad_norm": 0.23498570919036865, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.7999265560927842, + "grad_norm": 0.30229923129081726, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0068, + "step": 13070 + }, + { + "epoch": 0.8005385886529164, + "grad_norm": 0.2916681170463562, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0062, + "step": 13080 + }, + { + "epoch": 0.8011506212130486, + "grad_norm": 0.31905195116996765, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.8017626537731807, + "grad_norm": 0.22307109832763672, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 0.8023746863333129, + "grad_norm": 0.2815198004245758, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0054, + "step": 13110 + }, + { + "epoch": 0.8029867188934451, + "grad_norm": 0.18762829899787903, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0064, + "step": 13120 + }, + { + "epoch": 0.8035987514535773, + "grad_norm": 0.1918255090713501, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0064, + "step": 13130 + }, + { + "epoch": 0.8042107840137095, + "grad_norm": 0.3726229667663574, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0065, + "step": 13140 + }, + { + "epoch": 0.8048228165738417, + "grad_norm": 0.423285573720932, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0062, + "step": 13150 + }, + { + "epoch": 0.8054348491339739, + "grad_norm": 0.1709958165884018, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0052, + "step": 13160 + }, + { + "epoch": 0.8060468816941061, + "grad_norm": 0.3615981936454773, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 0.8066589142542383, + "grad_norm": 0.2101999819278717, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0053, + "step": 13180 + }, + { + "epoch": 0.8072709468143705, + "grad_norm": 0.14393582940101624, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.8078829793745027, + "grad_norm": 0.3704521656036377, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0056, + "step": 13200 + }, + { + "epoch": 0.8084950119346349, + "grad_norm": 0.23275913298130035, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0051, + "step": 13210 + }, + { + "epoch": 0.8091070444947671, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0045, + "step": 13220 + }, + { + "epoch": 0.8097190770548993, + "grad_norm": 0.21721667051315308, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.8103311096150315, + "grad_norm": 0.29456019401550293, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 0.8109431421751637, + "grad_norm": 0.19854630529880524, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0071, + "step": 13250 + }, + { + "epoch": 0.8115551747352959, + "grad_norm": 0.4318163990974426, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0059, + "step": 13260 + }, + { + "epoch": 0.8121672072954281, + "grad_norm": 0.3421531915664673, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.006, + "step": 13270 + }, + { + "epoch": 0.8127792398555603, + "grad_norm": 0.2370125651359558, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0054, + "step": 13280 + }, + { + "epoch": 0.8133912724156925, + "grad_norm": 0.2996460497379303, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 0.8140033049758247, + "grad_norm": 0.2911904454231262, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0053, + "step": 13300 + }, + { + "epoch": 0.8146153375359569, + "grad_norm": 0.26010408997535706, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0053, + "step": 13310 + }, + { + "epoch": 0.8152273700960891, + "grad_norm": 0.404702752828598, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0081, + "step": 13320 + }, + { + "epoch": 0.8158394026562213, + "grad_norm": 0.25591781735420227, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0057, + "step": 13330 + }, + { + "epoch": 0.8164514352163535, + "grad_norm": 0.1437849998474121, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0064, + "step": 13340 + }, + { + "epoch": 0.8170634677764858, + "grad_norm": 0.12252022325992584, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0047, + "step": 13350 + }, + { + "epoch": 0.817675500336618, + "grad_norm": 0.1861230581998825, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0063, + "step": 13360 + }, + { + "epoch": 0.8182875328967502, + "grad_norm": 0.2313026636838913, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0066, + "step": 13370 + }, + { + "epoch": 0.8188995654568824, + "grad_norm": 0.5445839166641235, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0076, + "step": 13380 + }, + { + "epoch": 0.8195115980170145, + "grad_norm": 0.21818871796131134, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0068, + "step": 13390 + }, + { + "epoch": 0.8201236305771467, + "grad_norm": 0.21823963522911072, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0072, + "step": 13400 + }, + { + "epoch": 0.8207356631372789, + "grad_norm": 0.1730659157037735, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0051, + "step": 13410 + }, + { + "epoch": 0.8213476956974111, + "grad_norm": 0.1301007866859436, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0075, + "step": 13420 + }, + { + "epoch": 0.8219597282575433, + "grad_norm": 0.32452520728111267, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.005, + "step": 13430 + }, + { + "epoch": 0.8225717608176755, + "grad_norm": 0.24771001935005188, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0058, + "step": 13440 + }, + { + "epoch": 0.8231837933778077, + "grad_norm": 0.4575227200984955, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0062, + "step": 13450 + }, + { + "epoch": 0.8237958259379399, + "grad_norm": 0.16441279649734497, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0081, + "step": 13460 + }, + { + "epoch": 0.8244078584980721, + "grad_norm": 0.26582902669906616, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0069, + "step": 13470 + }, + { + "epoch": 0.8250198910582043, + "grad_norm": 0.18871302902698517, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0068, + "step": 13480 + }, + { + "epoch": 0.8256319236183365, + "grad_norm": 0.23244783282279968, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0063, + "step": 13490 + }, + { + "epoch": 0.8262439561784687, + "grad_norm": 0.2399880290031433, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0036, + "step": 13500 + }, + { + "epoch": 0.8268559887386009, + "grad_norm": 0.25766822695732117, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0074, + "step": 13510 + }, + { + "epoch": 0.8274680212987331, + "grad_norm": 0.24792100489139557, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0066, + "step": 13520 + }, + { + "epoch": 0.8280800538588653, + "grad_norm": 0.3371896743774414, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8286920864189975, + "grad_norm": 0.16249819099903107, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0079, + "step": 13540 + }, + { + "epoch": 0.8293041189791297, + "grad_norm": 0.2705139219760895, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0069, + "step": 13550 + }, + { + "epoch": 0.8299161515392619, + "grad_norm": 0.1905352771282196, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0055, + "step": 13560 + }, + { + "epoch": 0.8305281840993941, + "grad_norm": 0.23938500881195068, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0055, + "step": 13570 + }, + { + "epoch": 0.8311402166595263, + "grad_norm": 0.3562251031398773, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0054, + "step": 13580 + }, + { + "epoch": 0.8317522492196585, + "grad_norm": 0.2934769093990326, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0064, + "step": 13590 + }, + { + "epoch": 0.8323642817797907, + "grad_norm": 0.252366840839386, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0078, + "step": 13600 + }, + { + "epoch": 0.8329763143399229, + "grad_norm": 0.16646964848041534, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0046, + "step": 13610 + }, + { + "epoch": 0.8335883469000551, + "grad_norm": 0.22584658861160278, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8342003794601873, + "grad_norm": 0.3578774034976959, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0049, + "step": 13630 + }, + { + "epoch": 0.8348124120203195, + "grad_norm": 0.3447739779949188, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0065, + "step": 13640 + }, + { + "epoch": 0.8354244445804517, + "grad_norm": 0.381954550743103, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0057, + "step": 13650 + }, + { + "epoch": 0.8360364771405839, + "grad_norm": 0.3563731908798218, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0065, + "step": 13660 + }, + { + "epoch": 0.836648509700716, + "grad_norm": 0.29516372084617615, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0053, + "step": 13670 + }, + { + "epoch": 0.8372605422608482, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0043, + "step": 13680 + }, + { + "epoch": 0.8378725748209804, + "grad_norm": 0.4608387351036072, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.005, + "step": 13690 + }, + { + "epoch": 0.8384846073811126, + "grad_norm": 0.31025534868240356, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0055, + "step": 13700 + }, + { + "epoch": 0.8390966399412448, + "grad_norm": 0.32904690504074097, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.839708672501377, + "grad_norm": 0.2547053098678589, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0061, + "step": 13720 + }, + { + "epoch": 0.8403207050615092, + "grad_norm": 0.30524104833602905, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.005, + "step": 13730 + }, + { + "epoch": 0.8409327376216414, + "grad_norm": 0.17741642892360687, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "epoch": 0.8415447701817736, + "grad_norm": 0.23125578463077545, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0053, + "step": 13750 + }, + { + "epoch": 0.8421568027419059, + "grad_norm": 0.3080023229122162, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0075, + "step": 13760 + }, + { + "epoch": 0.842768835302038, + "grad_norm": 0.2509821951389313, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0053, + "step": 13770 + }, + { + "epoch": 0.8433808678621703, + "grad_norm": 0.17483864724636078, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.005, + "step": 13780 + }, + { + "epoch": 0.8439929004223025, + "grad_norm": 0.3952518403530121, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0056, + "step": 13790 + }, + { + "epoch": 0.8446049329824347, + "grad_norm": 0.2945535480976105, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0055, + "step": 13800 + }, + { + "epoch": 0.8452169655425669, + "grad_norm": 0.13024291396141052, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0073, + "step": 13810 + }, + { + "epoch": 0.8458289981026991, + "grad_norm": 0.1840520054101944, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0061, + "step": 13820 + }, + { + "epoch": 0.8464410306628313, + "grad_norm": 0.2368786782026291, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0058, + "step": 13830 + }, + { + "epoch": 0.8470530632229635, + "grad_norm": 0.2885456085205078, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0055, + "step": 13840 + }, + { + "epoch": 0.8476650957830957, + "grad_norm": 0.2782488167285919, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0046, + "step": 13850 + }, + { + "epoch": 0.8482771283432279, + "grad_norm": 0.1711442470550537, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0055, + "step": 13860 + }, + { + "epoch": 0.8488891609033601, + "grad_norm": 0.22235877811908722, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0056, + "step": 13870 + }, + { + "epoch": 0.8495011934634923, + "grad_norm": 0.1937183290719986, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0047, + "step": 13880 + }, + { + "epoch": 0.8501132260236245, + "grad_norm": 0.33960190415382385, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0063, + "step": 13890 + }, + { + "epoch": 0.8507252585837567, + "grad_norm": 0.1983388215303421, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8513372911438889, + "grad_norm": 0.2968246638774872, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0051, + "step": 13910 + }, + { + "epoch": 0.8519493237040211, + "grad_norm": 0.25328314304351807, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0057, + "step": 13920 + }, + { + "epoch": 0.8525613562641533, + "grad_norm": 0.2435184270143509, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.8531733888242855, + "grad_norm": 0.24512560665607452, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0053, + "step": 13940 + }, + { + "epoch": 0.8537854213844176, + "grad_norm": 0.22028976678848267, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.006, + "step": 13950 + }, + { + "epoch": 0.8543974539445498, + "grad_norm": 0.24743935465812683, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0065, + "step": 13960 + }, + { + "epoch": 0.855009486504682, + "grad_norm": 0.1393810361623764, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0081, + "step": 13970 + }, + { + "epoch": 0.8556215190648142, + "grad_norm": 0.25975972414016724, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0063, + "step": 13980 + }, + { + "epoch": 0.8562335516249464, + "grad_norm": 0.1944616585969925, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0049, + "step": 13990 + }, + { + "epoch": 0.8568455841850786, + "grad_norm": 0.21936742961406708, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.8574576167452108, + "grad_norm": 0.1556629091501236, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 0.858069649305343, + "grad_norm": 0.23696991801261902, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 0.8586816818654752, + "grad_norm": 0.32507795095443726, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0061, + "step": 14030 + }, + { + "epoch": 0.8592937144256074, + "grad_norm": 0.35332199931144714, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0058, + "step": 14040 + }, + { + "epoch": 0.8599057469857396, + "grad_norm": 0.1835644394159317, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0046, + "step": 14050 + }, + { + "epoch": 0.8605177795458718, + "grad_norm": 0.19127517938613892, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0044, + "step": 14060 + }, + { + "epoch": 0.861129812106004, + "grad_norm": 0.30748996138572693, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0055, + "step": 14070 + }, + { + "epoch": 0.8617418446661362, + "grad_norm": 0.178785502910614, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0049, + "step": 14080 + }, + { + "epoch": 0.8623538772262684, + "grad_norm": 0.16979056596755981, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0044, + "step": 14090 + }, + { + "epoch": 0.8629659097864006, + "grad_norm": 0.19519983232021332, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0063, + "step": 14100 + }, + { + "epoch": 0.8635779423465328, + "grad_norm": 0.2722550928592682, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 0.864189974906665, + "grad_norm": 0.1956222504377365, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0054, + "step": 14120 + }, + { + "epoch": 0.8648020074667973, + "grad_norm": 0.32274308800697327, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0066, + "step": 14130 + }, + { + "epoch": 0.8654140400269295, + "grad_norm": 0.25953641533851624, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0056, + "step": 14140 + }, + { + "epoch": 0.8660260725870617, + "grad_norm": 0.3293299674987793, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0057, + "step": 14150 + }, + { + "epoch": 0.8666381051471939, + "grad_norm": 0.35404127836227417, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0072, + "step": 14160 + }, + { + "epoch": 0.8672501377073261, + "grad_norm": 0.24674376845359802, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0064, + "step": 14170 + }, + { + "epoch": 0.8678621702674583, + "grad_norm": 0.23506462574005127, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0047, + "step": 14180 + }, + { + "epoch": 0.8684742028275905, + "grad_norm": 0.30500903725624084, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.8690862353877227, + "grad_norm": 0.23000167310237885, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0051, + "step": 14200 + }, + { + "epoch": 0.8696982679478549, + "grad_norm": 0.17339368164539337, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 0.8703103005079871, + "grad_norm": 0.2505367696285248, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0059, + "step": 14220 + }, + { + "epoch": 0.8709223330681192, + "grad_norm": 0.22645734250545502, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0044, + "step": 14230 + }, + { + "epoch": 0.8715343656282514, + "grad_norm": 0.3509127199649811, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0043, + "step": 14240 + }, + { + "epoch": 0.8721463981883836, + "grad_norm": 0.2758972644805908, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0055, + "step": 14250 + }, + { + "epoch": 0.8727584307485158, + "grad_norm": 0.1943834275007248, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.006, + "step": 14260 + }, + { + "epoch": 0.873370463308648, + "grad_norm": 0.32881075143814087, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 0.8739824958687802, + "grad_norm": 0.35203438997268677, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8745945284289124, + "grad_norm": 0.13618917763233185, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0044, + "step": 14290 + }, + { + "epoch": 0.8752065609890446, + "grad_norm": 0.22939404845237732, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0044, + "step": 14300 + }, + { + "epoch": 0.8758185935491768, + "grad_norm": 0.2027491182088852, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0051, + "step": 14310 + }, + { + "epoch": 0.876430626109309, + "grad_norm": 0.21950028836727142, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0105, + "step": 14320 + }, + { + "epoch": 0.8770426586694412, + "grad_norm": 0.307913213968277, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0051, + "step": 14330 + }, + { + "epoch": 0.8776546912295734, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0058, + "step": 14340 + }, + { + "epoch": 0.8782667237897056, + "grad_norm": 0.3033636808395386, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0066, + "step": 14350 + }, + { + "epoch": 0.8788787563498378, + "grad_norm": 0.25514236092567444, + "learning_rate": 1.210961823379053e-05, + "loss": 0.005, + "step": 14360 + }, + { + "epoch": 0.87949078890997, + "grad_norm": 0.2574418783187866, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0069, + "step": 14370 + }, + { + "epoch": 0.8801028214701022, + "grad_norm": 0.17803016304969788, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.006, + "step": 14380 + }, + { + "epoch": 0.8807148540302344, + "grad_norm": 0.31375741958618164, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0077, + "step": 14390 + }, + { + "epoch": 0.8813268865903666, + "grad_norm": 0.18031778931617737, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0052, + "step": 14400 + }, + { + "epoch": 0.8819389191504988, + "grad_norm": 0.18077519536018372, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0055, + "step": 14410 + }, + { + "epoch": 0.882550951710631, + "grad_norm": 0.22171644866466522, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0059, + "step": 14420 + }, + { + "epoch": 0.8831629842707632, + "grad_norm": 0.16187389194965363, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0044, + "step": 14430 + }, + { + "epoch": 0.8837750168308954, + "grad_norm": 0.27667325735092163, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0063, + "step": 14440 + }, + { + "epoch": 0.8843870493910276, + "grad_norm": 0.2493051290512085, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0072, + "step": 14450 + }, + { + "epoch": 0.8849990819511598, + "grad_norm": 0.3519611656665802, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 0.885611114511292, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0057, + "step": 14470 + }, + { + "epoch": 0.8862231470714242, + "grad_norm": 0.24518658220767975, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.8868351796315564, + "grad_norm": 0.28493785858154297, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0055, + "step": 14490 + }, + { + "epoch": 0.8874472121916887, + "grad_norm": 0.22260263562202454, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 0.8880592447518207, + "grad_norm": 0.2804561257362366, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0051, + "step": 14510 + }, + { + "epoch": 0.888671277311953, + "grad_norm": 0.24349385499954224, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0045, + "step": 14520 + }, + { + "epoch": 0.8892833098720851, + "grad_norm": 0.262207955121994, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0082, + "step": 14530 + }, + { + "epoch": 0.8898953424322174, + "grad_norm": 0.15527820587158203, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0061, + "step": 14540 + }, + { + "epoch": 0.8905073749923496, + "grad_norm": 0.23850804567337036, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0042, + "step": 14550 + }, + { + "epoch": 0.8911194075524818, + "grad_norm": 0.2665582001209259, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0053, + "step": 14560 + }, + { + "epoch": 0.891731440112614, + "grad_norm": 0.2652167081832886, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 0.8923434726727462, + "grad_norm": 0.21386243402957916, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "epoch": 0.8929555052328784, + "grad_norm": 0.3087247312068939, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0082, + "step": 14590 + }, + { + "epoch": 0.8935675377930106, + "grad_norm": 0.2003909796476364, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0048, + "step": 14600 + }, + { + "epoch": 0.8941795703531428, + "grad_norm": 0.2214624583721161, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0062, + "step": 14610 + }, + { + "epoch": 0.894791602913275, + "grad_norm": 0.2500647306442261, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0052, + "step": 14620 + }, + { + "epoch": 0.8954036354734072, + "grad_norm": 0.2615419030189514, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0054, + "step": 14630 + }, + { + "epoch": 0.8960156680335394, + "grad_norm": 0.21347551047801971, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0048, + "step": 14640 + }, + { + "epoch": 0.8966277005936716, + "grad_norm": 0.35483887791633606, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0054, + "step": 14650 + }, + { + "epoch": 0.8972397331538038, + "grad_norm": 0.2423439472913742, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0055, + "step": 14660 + }, + { + "epoch": 0.897851765713936, + "grad_norm": 0.16826359927654266, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 0.8984637982740682, + "grad_norm": 0.3589499294757843, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "epoch": 0.8990758308342004, + "grad_norm": 0.3081042468547821, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 0.8996878633943326, + "grad_norm": 0.31996914744377136, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0063, + "step": 14700 + }, + { + "epoch": 0.9002998959544648, + "grad_norm": 0.301209419965744, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0073, + "step": 14710 + }, + { + "epoch": 0.900911928514597, + "grad_norm": 0.19257168471813202, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0055, + "step": 14720 + }, + { + "epoch": 0.9015239610747292, + "grad_norm": 0.15221600234508514, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0053, + "step": 14730 + }, + { + "epoch": 0.9021359936348614, + "grad_norm": 0.21519577503204346, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0055, + "step": 14740 + }, + { + "epoch": 0.9027480261949936, + "grad_norm": 0.23772196471691132, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.006, + "step": 14750 + }, + { + "epoch": 0.9033600587551258, + "grad_norm": 0.2872219979763031, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.903972091315258, + "grad_norm": 0.2589483857154846, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0056, + "step": 14770 + }, + { + "epoch": 0.9045841238753902, + "grad_norm": 0.31850162148475647, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0051, + "step": 14780 + }, + { + "epoch": 0.9051961564355223, + "grad_norm": 0.27179282903671265, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0051, + "step": 14790 + }, + { + "epoch": 0.9058081889956545, + "grad_norm": 0.4132739007472992, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.005, + "step": 14800 + }, + { + "epoch": 0.9064202215557867, + "grad_norm": 0.19336774945259094, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0041, + "step": 14810 + }, + { + "epoch": 0.9070322541159189, + "grad_norm": 0.20783282816410065, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.9076442866760511, + "grad_norm": 0.26141899824142456, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0069, + "step": 14830 + }, + { + "epoch": 0.9082563192361833, + "grad_norm": 0.2158539742231369, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0081, + "step": 14840 + }, + { + "epoch": 0.9088683517963155, + "grad_norm": 0.3233732581138611, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0065, + "step": 14850 + }, + { + "epoch": 0.9094803843564477, + "grad_norm": 0.23924769461154938, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0042, + "step": 14860 + }, + { + "epoch": 0.9100924169165799, + "grad_norm": 0.17663812637329102, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.004, + "step": 14870 + }, + { + "epoch": 0.9107044494767121, + "grad_norm": 0.34379643201828003, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.005, + "step": 14880 + }, + { + "epoch": 0.9113164820368443, + "grad_norm": 0.29971349239349365, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0059, + "step": 14890 + }, + { + "epoch": 0.9119285145969765, + "grad_norm": 0.24832949042320251, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0042, + "step": 14900 + }, + { + "epoch": 0.9125405471571088, + "grad_norm": 0.22288024425506592, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0065, + "step": 14910 + }, + { + "epoch": 0.913152579717241, + "grad_norm": 0.2806689441204071, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0043, + "step": 14920 + }, + { + "epoch": 0.9137646122773732, + "grad_norm": 0.3908274173736572, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0047, + "step": 14930 + }, + { + "epoch": 0.9143766448375054, + "grad_norm": 0.16255778074264526, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0068, + "step": 14940 + }, + { + "epoch": 0.9149886773976376, + "grad_norm": 0.430791437625885, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0085, + "step": 14950 + }, + { + "epoch": 0.9156007099577698, + "grad_norm": 0.1739969551563263, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0056, + "step": 14960 + }, + { + "epoch": 0.916212742517902, + "grad_norm": 0.24298283457756042, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0055, + "step": 14970 + }, + { + "epoch": 0.9168247750780342, + "grad_norm": 0.21269915997982025, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0051, + "step": 14980 + }, + { + "epoch": 0.9174368076381664, + "grad_norm": 0.263388991355896, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0058, + "step": 14990 + }, + { + "epoch": 0.9180488401982986, + "grad_norm": 0.28030532598495483, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 0.9186608727584308, + "grad_norm": 0.17051894962787628, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 0.919272905318563, + "grad_norm": 0.2763383388519287, + "learning_rate": 1.146875176249365e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 0.9198849378786952, + "grad_norm": 0.2616822421550751, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "epoch": 0.9204969704388274, + "grad_norm": 0.21407093107700348, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0062, + "step": 15040 + }, + { + "epoch": 0.9211090029989596, + "grad_norm": 0.23936578631401062, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0073, + "step": 15050 + }, + { + "epoch": 0.9217210355590918, + "grad_norm": 0.26383110880851746, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.006, + "step": 15060 + }, + { + "epoch": 0.922333068119224, + "grad_norm": 0.19477945566177368, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0043, + "step": 15070 + }, + { + "epoch": 0.9229451006793561, + "grad_norm": 0.16677282750606537, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0061, + "step": 15080 + }, + { + "epoch": 0.9235571332394883, + "grad_norm": 0.26856037974357605, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0065, + "step": 15090 + }, + { + "epoch": 0.9241691657996205, + "grad_norm": 0.20086173713207245, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0056, + "step": 15100 + }, + { + "epoch": 0.9247811983597527, + "grad_norm": 0.26998719573020935, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0034, + "step": 15110 + }, + { + "epoch": 0.9253932309198849, + "grad_norm": 0.12727728486061096, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0043, + "step": 15120 + }, + { + "epoch": 0.9260052634800171, + "grad_norm": 0.11288347095251083, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0055, + "step": 15130 + }, + { + "epoch": 0.9266172960401493, + "grad_norm": 0.1109771579504013, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0048, + "step": 15140 + }, + { + "epoch": 0.9272293286002815, + "grad_norm": 0.2556479275226593, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0056, + "step": 15150 + }, + { + "epoch": 0.9278413611604137, + "grad_norm": 0.2149561196565628, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.005, + "step": 15160 + }, + { + "epoch": 0.9284533937205459, + "grad_norm": 0.16953054070472717, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 0.9290654262806781, + "grad_norm": 0.18306049704551697, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.004, + "step": 15180 + }, + { + "epoch": 0.9296774588408103, + "grad_norm": 0.15755385160446167, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0035, + "step": 15190 + }, + { + "epoch": 0.9302894914009425, + "grad_norm": 0.21062517166137695, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 0.9309015239610747, + "grad_norm": 0.1403888463973999, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0051, + "step": 15210 + }, + { + "epoch": 0.9315135565212069, + "grad_norm": 0.4044550359249115, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0062, + "step": 15220 + }, + { + "epoch": 0.9321255890813391, + "grad_norm": 0.22543896734714508, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 0.9327376216414713, + "grad_norm": 0.2025403380393982, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0055, + "step": 15240 + }, + { + "epoch": 0.9333496542016035, + "grad_norm": 1.0549683570861816, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0092, + "step": 15250 + }, + { + "epoch": 0.9339616867617357, + "grad_norm": 0.3442397117614746, + "learning_rate": 1.123494277220359e-05, + "loss": 0.005, + "step": 15260 + }, + { + "epoch": 0.934573719321868, + "grad_norm": 0.1678813248872757, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.005, + "step": 15270 + }, + { + "epoch": 0.9351857518820001, + "grad_norm": 0.31081119179725647, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 0.9357977844421324, + "grad_norm": 0.25498780608177185, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.006, + "step": 15290 + }, + { + "epoch": 0.9364098170022646, + "grad_norm": 0.21825125813484192, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0054, + "step": 15300 + }, + { + "epoch": 0.9370218495623968, + "grad_norm": 0.19719983637332916, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0074, + "step": 15310 + }, + { + "epoch": 0.937633882122529, + "grad_norm": 0.32297465205192566, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0058, + "step": 15320 + }, + { + "epoch": 0.9382459146826612, + "grad_norm": 0.2717733383178711, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0035, + "step": 15330 + }, + { + "epoch": 0.9388579472427934, + "grad_norm": 0.22138433158397675, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0048, + "step": 15340 + }, + { + "epoch": 0.9394699798029256, + "grad_norm": 0.1943465769290924, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 0.9400820123630577, + "grad_norm": 0.18422184884548187, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0054, + "step": 15360 + }, + { + "epoch": 0.9406940449231899, + "grad_norm": 0.17614246904850006, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0054, + "step": 15370 + }, + { + "epoch": 0.9413060774833221, + "grad_norm": 0.17661592364311218, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9419181100434543, + "grad_norm": 0.42976850271224976, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0054, + "step": 15390 + }, + { + "epoch": 0.9425301426035865, + "grad_norm": 0.34272316098213196, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0044, + "step": 15400 + }, + { + "epoch": 0.9431421751637187, + "grad_norm": 0.3346613645553589, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0042, + "step": 15410 + }, + { + "epoch": 0.9437542077238509, + "grad_norm": 0.15300114452838898, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0057, + "step": 15420 + }, + { + "epoch": 0.9443662402839831, + "grad_norm": 0.23935656249523163, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0084, + "step": 15430 + }, + { + "epoch": 0.9449782728441153, + "grad_norm": 0.21595227718353271, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0051, + "step": 15440 + }, + { + "epoch": 0.9455903054042475, + "grad_norm": 0.2670149505138397, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0059, + "step": 15450 + }, + { + "epoch": 0.9462023379643797, + "grad_norm": 0.2214009314775467, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0054, + "step": 15460 + }, + { + "epoch": 0.9468143705245119, + "grad_norm": 0.3491996228694916, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 0.9474264030846441, + "grad_norm": 0.28213024139404297, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0054, + "step": 15480 + }, + { + "epoch": 0.9480384356447763, + "grad_norm": 0.30218765139579773, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0049, + "step": 15490 + }, + { + "epoch": 0.9486504682049085, + "grad_norm": 0.17068025469779968, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0046, + "step": 15500 + }, + { + "epoch": 0.9492625007650407, + "grad_norm": 0.23325121402740479, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0054, + "step": 15510 + }, + { + "epoch": 0.9498745333251729, + "grad_norm": 0.22118528187274933, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0035, + "step": 15520 + }, + { + "epoch": 0.9504865658853051, + "grad_norm": 0.20202121138572693, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9510985984454373, + "grad_norm": 0.28455010056495667, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0039, + "step": 15540 + }, + { + "epoch": 0.9517106310055695, + "grad_norm": 0.26871445775032043, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0046, + "step": 15550 + }, + { + "epoch": 0.9523226635657017, + "grad_norm": 0.33665943145751953, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0058, + "step": 15560 + }, + { + "epoch": 0.9529346961258339, + "grad_norm": 0.3182595670223236, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0053, + "step": 15570 + }, + { + "epoch": 0.9535467286859661, + "grad_norm": 0.2867930829524994, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0068, + "step": 15580 + }, + { + "epoch": 0.9541587612460983, + "grad_norm": 0.21562239527702332, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.0051, + "step": 15590 + }, + { + "epoch": 0.9547707938062305, + "grad_norm": 0.19122859835624695, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0046, + "step": 15600 + }, + { + "epoch": 0.9553828263663627, + "grad_norm": 0.24596959352493286, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.005, + "step": 15610 + }, + { + "epoch": 0.9559948589264949, + "grad_norm": 0.182195246219635, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0038, + "step": 15620 + }, + { + "epoch": 0.9566068914866271, + "grad_norm": 0.3122585415840149, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0053, + "step": 15630 + }, + { + "epoch": 0.9572189240467592, + "grad_norm": 0.25725093483924866, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0063, + "step": 15640 + }, + { + "epoch": 0.9578309566068914, + "grad_norm": 0.19965514540672302, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0053, + "step": 15650 + }, + { + "epoch": 0.9584429891670236, + "grad_norm": 0.3474758267402649, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.006, + "step": 15660 + }, + { + "epoch": 0.9590550217271558, + "grad_norm": 0.18151336908340454, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0048, + "step": 15670 + }, + { + "epoch": 0.959667054287288, + "grad_norm": 0.18923020362854004, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0037, + "step": 15680 + }, + { + "epoch": 0.9602790868474202, + "grad_norm": 0.19792871177196503, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0049, + "step": 15690 + }, + { + "epoch": 0.9608911194075525, + "grad_norm": 0.20296797156333923, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0047, + "step": 15700 + }, + { + "epoch": 0.9615031519676847, + "grad_norm": 0.2556051015853882, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.9621151845278169, + "grad_norm": 0.35538288950920105, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0037, + "step": 15720 + }, + { + "epoch": 0.9627272170879491, + "grad_norm": 0.45357266068458557, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0065, + "step": 15730 + }, + { + "epoch": 0.9633392496480813, + "grad_norm": 0.23721693456172943, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0046, + "step": 15740 + }, + { + "epoch": 0.9639512822082135, + "grad_norm": 0.2727845013141632, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0052, + "step": 15750 + }, + { + "epoch": 0.9645633147683457, + "grad_norm": 0.2647950351238251, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0054, + "step": 15760 + }, + { + "epoch": 0.9651753473284779, + "grad_norm": 0.23364882171154022, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.005, + "step": 15770 + }, + { + "epoch": 0.9657873798886101, + "grad_norm": 0.2035825401544571, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0054, + "step": 15780 + }, + { + "epoch": 0.9663994124487423, + "grad_norm": 0.2411692589521408, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0062, + "step": 15790 + }, + { + "epoch": 0.9670114450088745, + "grad_norm": 0.23559266328811646, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 0.9676234775690067, + "grad_norm": 0.23872418701648712, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0063, + "step": 15810 + }, + { + "epoch": 0.9682355101291389, + "grad_norm": 0.27072128653526306, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0052, + "step": 15820 + }, + { + "epoch": 0.9688475426892711, + "grad_norm": 0.42610588669776917, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0056, + "step": 15830 + }, + { + "epoch": 0.9694595752494033, + "grad_norm": 0.13065233826637268, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0044, + "step": 15840 + }, + { + "epoch": 0.9700716078095355, + "grad_norm": 0.2479996383190155, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0049, + "step": 15850 + }, + { + "epoch": 0.9706836403696677, + "grad_norm": 0.22867974638938904, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 0.9712956729297999, + "grad_norm": 0.21570387482643127, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0048, + "step": 15870 + }, + { + "epoch": 0.9719077054899321, + "grad_norm": 0.26354169845581055, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0073, + "step": 15880 + }, + { + "epoch": 0.9725197380500643, + "grad_norm": 0.19785451889038086, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0044, + "step": 15890 + }, + { + "epoch": 0.9731317706101965, + "grad_norm": 0.09346124529838562, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0051, + "step": 15900 + }, + { + "epoch": 0.9737438031703287, + "grad_norm": 0.18946298956871033, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0049, + "step": 15910 + }, + { + "epoch": 0.9743558357304608, + "grad_norm": 0.1761726588010788, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0057, + "step": 15920 + }, + { + "epoch": 0.974967868290593, + "grad_norm": 0.2610328495502472, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0061, + "step": 15930 + }, + { + "epoch": 0.9755799008507252, + "grad_norm": 0.1841743141412735, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 0.9761919334108574, + "grad_norm": 0.14279355108737946, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 0.9768039659709896, + "grad_norm": 0.1717681884765625, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0035, + "step": 15960 + }, + { + "epoch": 0.9774159985311218, + "grad_norm": 0.2102527618408203, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.007, + "step": 15970 + }, + { + "epoch": 0.978028031091254, + "grad_norm": 0.29462379217147827, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0058, + "step": 15980 + }, + { + "epoch": 0.9786400636513862, + "grad_norm": 0.1863207072019577, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0058, + "step": 15990 + }, + { + "epoch": 0.9792520962115184, + "grad_norm": 0.2764773964881897, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0051, + "step": 16000 + }, + { + "epoch": 0.9798641287716506, + "grad_norm": 0.2723250091075897, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0056, + "step": 16010 + }, + { + "epoch": 0.9804761613317828, + "grad_norm": 0.21564331650733948, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0048, + "step": 16020 + }, + { + "epoch": 0.981088193891915, + "grad_norm": 0.20242232084274292, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0058, + "step": 16030 + }, + { + "epoch": 0.9817002264520472, + "grad_norm": 0.21522754430770874, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0039, + "step": 16040 + }, + { + "epoch": 0.9823122590121794, + "grad_norm": 0.20013833045959473, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0051, + "step": 16050 + }, + { + "epoch": 0.9829242915723116, + "grad_norm": 0.3008810579776764, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 0.9835363241324439, + "grad_norm": 0.2994979918003082, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0033, + "step": 16070 + }, + { + "epoch": 0.984148356692576, + "grad_norm": 0.22704628109931946, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0046, + "step": 16080 + }, + { + "epoch": 0.9847603892527083, + "grad_norm": 0.3253551423549652, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9853724218128405, + "grad_norm": 0.14902091026306152, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0042, + "step": 16100 + }, + { + "epoch": 0.9859844543729727, + "grad_norm": 0.15155524015426636, + "learning_rate": 1.04066696184376e-05, + "loss": 0.005, + "step": 16110 + }, + { + "epoch": 0.9865964869331049, + "grad_norm": 0.1859518140554428, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0063, + "step": 16120 + }, + { + "epoch": 0.9872085194932371, + "grad_norm": 0.5434902906417847, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0072, + "step": 16130 + }, + { + "epoch": 0.9878205520533693, + "grad_norm": 0.19308103621006012, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0046, + "step": 16140 + }, + { + "epoch": 0.9884325846135015, + "grad_norm": 0.21260593831539154, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0077, + "step": 16150 + }, + { + "epoch": 0.9890446171736337, + "grad_norm": 0.15255668759346008, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0059, + "step": 16160 + }, + { + "epoch": 0.9896566497337659, + "grad_norm": 0.18739885091781616, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0047, + "step": 16170 + }, + { + "epoch": 0.9902686822938981, + "grad_norm": 0.2112029641866684, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0049, + "step": 16180 + }, + { + "epoch": 0.9908807148540303, + "grad_norm": 0.35941991209983826, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.005, + "step": 16190 + }, + { + "epoch": 0.9914927474141624, + "grad_norm": 0.16792108118534088, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0051, + "step": 16200 + }, + { + "epoch": 0.9921047799742946, + "grad_norm": 0.1985466182231903, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0063, + "step": 16210 + }, + { + "epoch": 0.9927168125344268, + "grad_norm": 0.17579570412635803, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0046, + "step": 16220 + }, + { + "epoch": 0.993328845094559, + "grad_norm": 0.23352178931236267, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0061, + "step": 16230 + }, + { + "epoch": 0.9939408776546912, + "grad_norm": 0.3543553054332733, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0054, + "step": 16240 + }, + { + "epoch": 0.9945529102148234, + "grad_norm": 0.18603719770908356, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0049, + "step": 16250 + }, + { + "epoch": 0.9951649427749556, + "grad_norm": 0.31745344400405884, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0061, + "step": 16260 + }, + { + "epoch": 0.9957769753350878, + "grad_norm": 0.1416773498058319, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0063, + "step": 16270 + }, + { + "epoch": 0.99638900789522, + "grad_norm": 0.18451642990112305, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0055, + "step": 16280 + }, + { + "epoch": 0.9970010404553522, + "grad_norm": 0.13422183692455292, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0047, + "step": 16290 + }, + { + "epoch": 0.9976130730154844, + "grad_norm": 0.15831588208675385, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0045, + "step": 16300 + }, + { + "epoch": 0.9982251055756166, + "grad_norm": 0.42520084977149963, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0053, + "step": 16310 + }, + { + "epoch": 0.9988371381357488, + "grad_norm": 0.20889437198638916, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0043, + "step": 16320 + }, + { + "epoch": 0.999449170695881, + "grad_norm": 0.17016667127609253, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0072, + "step": 16330 + }, + { + "epoch": 1.0000612032560132, + "grad_norm": 0.3129214346408844, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0054, + "step": 16340 + }, + { + "epoch": 1.0006732358161454, + "grad_norm": 0.334224134683609, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0037, + "step": 16350 + }, + { + "epoch": 1.0012852683762776, + "grad_norm": 0.28502705693244934, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0065, + "step": 16360 + }, + { + "epoch": 1.0018973009364098, + "grad_norm": 0.21431966125965118, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0046, + "step": 16370 + }, + { + "epoch": 1.002509333496542, + "grad_norm": 0.22898051142692566, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 1.0031213660566742, + "grad_norm": 0.41625624895095825, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0068, + "step": 16390 + }, + { + "epoch": 1.0037333986168064, + "grad_norm": 0.2510327398777008, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 1.0043454311769386, + "grad_norm": 0.23560962080955505, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0049, + "step": 16410 + }, + { + "epoch": 1.0049574637370708, + "grad_norm": 0.2081199437379837, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0061, + "step": 16420 + }, + { + "epoch": 1.005569496297203, + "grad_norm": 0.12456244230270386, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0057, + "step": 16430 + }, + { + "epoch": 1.0061815288573353, + "grad_norm": 0.22212636470794678, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0052, + "step": 16440 + }, + { + "epoch": 1.0067935614174675, + "grad_norm": 0.27772897481918335, + "learning_rate": 1.007637577910799e-05, + "loss": 0.007, + "step": 16450 + }, + { + "epoch": 1.0074055939775997, + "grad_norm": 0.40040507912635803, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0051, + "step": 16460 + }, + { + "epoch": 1.0080176265377319, + "grad_norm": 0.19763565063476562, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0047, + "step": 16470 + }, + { + "epoch": 1.008629659097864, + "grad_norm": 0.2906181514263153, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0055, + "step": 16480 + }, + { + "epoch": 1.0092416916579963, + "grad_norm": 0.29949888586997986, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0045, + "step": 16490 + }, + { + "epoch": 1.0098537242181285, + "grad_norm": 0.3900962769985199, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0053, + "step": 16500 + }, + { + "epoch": 1.0104657567782607, + "grad_norm": 0.22380846738815308, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0043, + "step": 16510 + }, + { + "epoch": 1.0110777893383929, + "grad_norm": 0.3426673412322998, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0052, + "step": 16520 + }, + { + "epoch": 1.011689821898525, + "grad_norm": 0.2452230006456375, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0055, + "step": 16530 + }, + { + "epoch": 1.0123018544586573, + "grad_norm": 0.24280408024787903, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0042, + "step": 16540 + }, + { + "epoch": 1.0129138870187895, + "grad_norm": 0.18271701037883759, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0047, + "step": 16550 + }, + { + "epoch": 1.0135259195789217, + "grad_norm": 0.2874322235584259, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0059, + "step": 16560 + }, + { + "epoch": 1.0141379521390539, + "grad_norm": 0.17367394268512726, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0048, + "step": 16570 + }, + { + "epoch": 1.014749984699186, + "grad_norm": 0.167460098862648, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0074, + "step": 16580 + }, + { + "epoch": 1.0153620172593183, + "grad_norm": 0.21867765486240387, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 1.0159740498194505, + "grad_norm": 0.2539086639881134, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0057, + "step": 16600 + }, + { + "epoch": 1.0165860823795827, + "grad_norm": 0.1415795534849167, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 1.0171981149397147, + "grad_norm": 0.12702493369579315, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0038, + "step": 16620 + }, + { + "epoch": 1.0178101474998469, + "grad_norm": 0.16548305749893188, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0042, + "step": 16630 + }, + { + "epoch": 1.018422180059979, + "grad_norm": 0.4413173496723175, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0059, + "step": 16640 + }, + { + "epoch": 1.0190342126201113, + "grad_norm": 0.30871614813804626, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0045, + "step": 16650 + }, + { + "epoch": 1.0196462451802435, + "grad_norm": 0.259650319814682, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0044, + "step": 16660 + }, + { + "epoch": 1.0202582777403757, + "grad_norm": 0.36035388708114624, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0068, + "step": 16670 + }, + { + "epoch": 1.020870310300508, + "grad_norm": 0.3487808406352997, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0038, + "step": 16680 + }, + { + "epoch": 1.02148234286064, + "grad_norm": 0.2898370623588562, + "learning_rate": 9.843955128197274e-06, + "loss": 0.004, + "step": 16690 + }, + { + "epoch": 1.0220943754207723, + "grad_norm": 0.2942182719707489, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0042, + "step": 16700 + }, + { + "epoch": 1.0227064079809045, + "grad_norm": 0.27839869260787964, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0042, + "step": 16710 + }, + { + "epoch": 1.0233184405410367, + "grad_norm": 0.17199957370758057, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0059, + "step": 16720 + }, + { + "epoch": 1.023930473101169, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0073, + "step": 16730 + }, + { + "epoch": 1.0245425056613011, + "grad_norm": 0.19908513128757477, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0047, + "step": 16740 + }, + { + "epoch": 1.0251545382214333, + "grad_norm": 0.23300328850746155, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0055, + "step": 16750 + }, + { + "epoch": 1.0257665707815655, + "grad_norm": 0.24671277403831482, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0043, + "step": 16760 + }, + { + "epoch": 1.0263786033416977, + "grad_norm": 0.23183101415634155, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0052, + "step": 16770 + }, + { + "epoch": 1.02699063590183, + "grad_norm": 0.13460612297058105, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0035, + "step": 16780 + }, + { + "epoch": 1.0276026684619621, + "grad_norm": 0.1990940123796463, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0044, + "step": 16790 + }, + { + "epoch": 1.0282147010220943, + "grad_norm": 0.21223406493663788, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 1.0288267335822265, + "grad_norm": 0.2649106979370117, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0044, + "step": 16810 + }, + { + "epoch": 1.0294387661423587, + "grad_norm": 0.2524845600128174, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0048, + "step": 16820 + }, + { + "epoch": 1.030050798702491, + "grad_norm": 0.22169779241085052, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 1.0306628312626231, + "grad_norm": 0.16642418503761292, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0048, + "step": 16840 + }, + { + "epoch": 1.0312748638227553, + "grad_norm": 0.22939598560333252, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0048, + "step": 16850 + }, + { + "epoch": 1.0318868963828876, + "grad_norm": 0.2131129503250122, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0055, + "step": 16860 + }, + { + "epoch": 1.0324989289430198, + "grad_norm": 0.20492705702781677, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0041, + "step": 16870 + }, + { + "epoch": 1.033110961503152, + "grad_norm": 0.2988845705986023, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0042, + "step": 16880 + }, + { + "epoch": 1.0337229940632842, + "grad_norm": 0.18579600751399994, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0042, + "step": 16890 + }, + { + "epoch": 1.0343350266234164, + "grad_norm": 0.2553490698337555, + "learning_rate": 9.641222698101725e-06, + "loss": 0.005, + "step": 16900 + }, + { + "epoch": 1.0349470591835486, + "grad_norm": 0.338440865278244, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0036, + "step": 16910 + }, + { + "epoch": 1.0355590917436808, + "grad_norm": 0.12755723297595978, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0044, + "step": 16920 + }, + { + "epoch": 1.036171124303813, + "grad_norm": 0.12222232669591904, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 1.0367831568639452, + "grad_norm": 0.20246204733848572, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0055, + "step": 16940 + }, + { + "epoch": 1.0373951894240774, + "grad_norm": 0.36903291940689087, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0051, + "step": 16950 + }, + { + "epoch": 1.0380072219842096, + "grad_norm": 0.3166116178035736, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0045, + "step": 16960 + }, + { + "epoch": 1.0386192545443418, + "grad_norm": 0.2777375280857086, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0041, + "step": 16970 + }, + { + "epoch": 1.039231287104474, + "grad_norm": 0.3173989951610565, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0053, + "step": 16980 + }, + { + "epoch": 1.0398433196646062, + "grad_norm": 0.2135571539402008, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0051, + "step": 16990 + }, + { + "epoch": 1.0404553522247384, + "grad_norm": 0.18536782264709473, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0037, + "step": 17000 + }, + { + "epoch": 1.0410673847848706, + "grad_norm": 0.17782410979270935, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0052, + "step": 17010 + }, + { + "epoch": 1.0416794173450028, + "grad_norm": 0.31509512662887573, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0099, + "step": 17020 + }, + { + "epoch": 1.042291449905135, + "grad_norm": 0.22748225927352905, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 1.0429034824652672, + "grad_norm": 0.14924705028533936, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 1.0435155150253994, + "grad_norm": 0.21390999853610992, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0044, + "step": 17050 + }, + { + "epoch": 1.0441275475855316, + "grad_norm": 0.25828516483306885, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0042, + "step": 17060 + }, + { + "epoch": 1.0447395801456638, + "grad_norm": 0.24069662392139435, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0069, + "step": 17070 + }, + { + "epoch": 1.045351612705796, + "grad_norm": 0.1090504601597786, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0036, + "step": 17080 + }, + { + "epoch": 1.0459636452659282, + "grad_norm": 0.17990687489509583, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0049, + "step": 17090 + }, + { + "epoch": 1.0465756778260604, + "grad_norm": 0.21505555510520935, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0051, + "step": 17100 + }, + { + "epoch": 1.0471877103861926, + "grad_norm": 0.2157493680715561, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0063, + "step": 17110 + }, + { + "epoch": 1.0477997429463248, + "grad_norm": 0.30865493416786194, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0053, + "step": 17120 + }, + { + "epoch": 1.048411775506457, + "grad_norm": 0.16882938146591187, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0043, + "step": 17130 + }, + { + "epoch": 1.0490238080665892, + "grad_norm": 0.14921846985816956, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0496358406267214, + "grad_norm": 0.15723800659179688, + "learning_rate": 9.400800085133245e-06, + "loss": 0.005, + "step": 17150 + }, + { + "epoch": 1.0502478731868536, + "grad_norm": 0.19597285985946655, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0046, + "step": 17160 + }, + { + "epoch": 1.0508599057469858, + "grad_norm": 0.1684723198413849, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0053, + "step": 17170 + }, + { + "epoch": 1.051471938307118, + "grad_norm": 0.1733175367116928, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0053, + "step": 17180 + }, + { + "epoch": 1.0520839708672503, + "grad_norm": 0.23111647367477417, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0048, + "step": 17190 + }, + { + "epoch": 1.0526960034273822, + "grad_norm": 0.36174628138542175, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0049, + "step": 17200 + }, + { + "epoch": 1.0533080359875144, + "grad_norm": 0.15791575610637665, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0048, + "step": 17210 + }, + { + "epoch": 1.0539200685476466, + "grad_norm": 0.16026809811592102, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0047, + "step": 17220 + }, + { + "epoch": 1.0545321011077788, + "grad_norm": 0.13964296877384186, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0033, + "step": 17230 + }, + { + "epoch": 1.055144133667911, + "grad_norm": 0.22623896598815918, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0041, + "step": 17240 + }, + { + "epoch": 1.0557561662280432, + "grad_norm": 0.15534555912017822, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 1.0563681987881754, + "grad_norm": 0.09519665688276291, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0035, + "step": 17260 + }, + { + "epoch": 1.0569802313483077, + "grad_norm": 0.19323785603046417, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0045, + "step": 17270 + }, + { + "epoch": 1.0575922639084399, + "grad_norm": 0.21194952726364136, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0047, + "step": 17280 + }, + { + "epoch": 1.058204296468572, + "grad_norm": 0.28977999091148376, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0049, + "step": 17290 + }, + { + "epoch": 1.0588163290287043, + "grad_norm": 0.1739121824502945, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0062, + "step": 17300 + }, + { + "epoch": 1.0594283615888365, + "grad_norm": 0.23189865052700043, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0055, + "step": 17310 + }, + { + "epoch": 1.0600403941489687, + "grad_norm": 0.15705449879169464, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 1.0606524267091009, + "grad_norm": 0.23189882934093475, + "learning_rate": 9.228411903689187e-06, + "loss": 0.003, + "step": 17330 + }, + { + "epoch": 1.061264459269233, + "grad_norm": 0.19559095799922943, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0051, + "step": 17340 + }, + { + "epoch": 1.0618764918293653, + "grad_norm": 0.2560543715953827, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0624885243894975, + "grad_norm": 0.35167232155799866, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0042, + "step": 17360 + }, + { + "epoch": 1.0631005569496297, + "grad_norm": 0.17626497149467468, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0039, + "step": 17370 + }, + { + "epoch": 1.0637125895097619, + "grad_norm": 0.18818546831607819, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0043, + "step": 17380 + }, + { + "epoch": 1.064324622069894, + "grad_norm": 0.10237561911344528, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0053, + "step": 17390 + }, + { + "epoch": 1.0649366546300263, + "grad_norm": 0.21828459203243256, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0042, + "step": 17400 + }, + { + "epoch": 1.0655486871901585, + "grad_norm": 0.09354235231876373, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 1.0661607197502907, + "grad_norm": 0.18106088042259216, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0051, + "step": 17420 + }, + { + "epoch": 1.066772752310423, + "grad_norm": 0.21538101136684418, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0056, + "step": 17430 + }, + { + "epoch": 1.067384784870555, + "grad_norm": 0.18729519844055176, + "learning_rate": 9.1233909973763e-06, + "loss": 0.004, + "step": 17440 + }, + { + "epoch": 1.0679968174306873, + "grad_norm": 0.3791484832763672, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0052, + "step": 17450 + }, + { + "epoch": 1.0686088499908195, + "grad_norm": 0.19206254184246063, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0042, + "step": 17460 + }, + { + "epoch": 1.0692208825509517, + "grad_norm": 0.15434518456459045, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0061, + "step": 17470 + }, + { + "epoch": 1.069832915111084, + "grad_norm": 0.17898093163967133, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 1.0704449476712161, + "grad_norm": 0.21975649893283844, + "learning_rate": 9.07574141798717e-06, + "loss": 0.005, + "step": 17490 + }, + { + "epoch": 1.0710569802313483, + "grad_norm": 0.1380346417427063, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0032, + "step": 17500 + }, + { + "epoch": 1.0716690127914805, + "grad_norm": 0.28567400574684143, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0044, + "step": 17510 + }, + { + "epoch": 1.0722810453516127, + "grad_norm": 0.22925534844398499, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 1.072893077911745, + "grad_norm": 0.27094215154647827, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0047, + "step": 17530 + }, + { + "epoch": 1.0735051104718771, + "grad_norm": 0.32299691438674927, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0048, + "step": 17540 + }, + { + "epoch": 1.0741171430320093, + "grad_norm": 0.26789531111717224, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 1.0747291755921415, + "grad_norm": 0.3175952434539795, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0077, + "step": 17560 + }, + { + "epoch": 1.0753412081522737, + "grad_norm": 0.24784249067306519, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 1.075953240712406, + "grad_norm": 0.3081960380077362, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0046, + "step": 17580 + }, + { + "epoch": 1.0765652732725381, + "grad_norm": 0.25334152579307556, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0039, + "step": 17590 + }, + { + "epoch": 1.0771773058326704, + "grad_norm": 0.24747619032859802, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0777893383928026, + "grad_norm": 0.19048908352851868, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0049, + "step": 17610 + }, + { + "epoch": 1.0784013709529348, + "grad_norm": 0.18883349001407623, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "epoch": 1.079013403513067, + "grad_norm": 0.18653099238872528, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0044, + "step": 17630 + }, + { + "epoch": 1.0796254360731992, + "grad_norm": 0.1320251226425171, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 1.0802374686333314, + "grad_norm": 0.14996238052845, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0041, + "step": 17650 + }, + { + "epoch": 1.0808495011934636, + "grad_norm": 0.4576573073863983, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0059, + "step": 17660 + }, + { + "epoch": 1.0814615337535958, + "grad_norm": 0.19582511484622955, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0051, + "step": 17670 + }, + { + "epoch": 1.082073566313728, + "grad_norm": 0.21973003447055817, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0059, + "step": 17680 + }, + { + "epoch": 1.0826855988738602, + "grad_norm": 0.18183568120002747, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0056, + "step": 17690 + }, + { + "epoch": 1.0832976314339924, + "grad_norm": 0.1761978417634964, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0049, + "step": 17700 + }, + { + "epoch": 1.0839096639941246, + "grad_norm": 0.10185366123914719, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0041, + "step": 17710 + }, + { + "epoch": 1.0845216965542568, + "grad_norm": 0.262513130903244, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0046, + "step": 17720 + }, + { + "epoch": 1.0851337291143888, + "grad_norm": 0.36413198709487915, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0043, + "step": 17730 + }, + { + "epoch": 1.085745761674521, + "grad_norm": 0.2258218675851822, + "learning_rate": 8.83836825410936e-06, + "loss": 0.005, + "step": 17740 + }, + { + "epoch": 1.0863577942346532, + "grad_norm": 0.20840497314929962, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0067, + "step": 17750 + }, + { + "epoch": 1.0869698267947854, + "grad_norm": 0.33392995595932007, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.0875818593549176, + "grad_norm": 0.18477876484394073, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0039, + "step": 17770 + }, + { + "epoch": 1.0881938919150498, + "grad_norm": 0.14785899221897125, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0063, + "step": 17780 + }, + { + "epoch": 1.088805924475182, + "grad_norm": 0.12930043041706085, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0055, + "step": 17790 + }, + { + "epoch": 1.0894179570353142, + "grad_norm": 0.1541786789894104, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0035, + "step": 17800 + }, + { + "epoch": 1.0900299895954464, + "grad_norm": 0.1781499683856964, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0054, + "step": 17810 + }, + { + "epoch": 1.0906420221555786, + "grad_norm": 0.13659314811229706, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0912540547157108, + "grad_norm": 0.18936918675899506, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0041, + "step": 17830 + }, + { + "epoch": 1.091866087275843, + "grad_norm": 0.24795638024806976, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0043, + "step": 17840 + }, + { + "epoch": 1.0924781198359752, + "grad_norm": 0.28090324997901917, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 1.0930901523961074, + "grad_norm": 0.3130576014518738, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0042, + "step": 17860 + }, + { + "epoch": 1.0937021849562396, + "grad_norm": 0.19758646190166473, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0047, + "step": 17870 + }, + { + "epoch": 1.0943142175163718, + "grad_norm": 0.20309071242809296, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0042, + "step": 17880 + }, + { + "epoch": 1.094926250076504, + "grad_norm": 0.19741898775100708, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0057, + "step": 17890 + }, + { + "epoch": 1.0955382826366362, + "grad_norm": 0.19182747602462769, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 1.0961503151967684, + "grad_norm": 0.14508575201034546, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.0967623477569006, + "grad_norm": 0.19854849576950073, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0064, + "step": 17920 + }, + { + "epoch": 1.0973743803170328, + "grad_norm": 0.15055720508098602, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0049, + "step": 17930 + }, + { + "epoch": 1.097986412877165, + "grad_norm": 0.1855372190475464, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0043, + "step": 17940 + }, + { + "epoch": 1.0985984454372972, + "grad_norm": 0.13770940899848938, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0058, + "step": 17950 + }, + { + "epoch": 1.0992104779974294, + "grad_norm": 0.24905221164226532, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0048, + "step": 17960 + }, + { + "epoch": 1.0998225105575616, + "grad_norm": 0.1951165348291397, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0043, + "step": 17970 + }, + { + "epoch": 1.1004345431176938, + "grad_norm": 0.18365852534770966, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 1.101046575677826, + "grad_norm": 0.16304127871990204, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0034, + "step": 17990 + }, + { + "epoch": 1.1016586082379582, + "grad_norm": 0.262677401304245, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 1.1022706407980905, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.583791146965244e-06, + "loss": 0.007, + "step": 18010 + }, + { + "epoch": 1.1028826733582227, + "grad_norm": 0.2832951247692108, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0056, + "step": 18020 + }, + { + "epoch": 1.1034947059183549, + "grad_norm": 0.1781810224056244, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 1.104106738478487, + "grad_norm": 0.23228950798511505, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0045, + "step": 18040 + }, + { + "epoch": 1.1047187710386193, + "grad_norm": 0.2573170065879822, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 1.1053308035987515, + "grad_norm": 0.30996036529541016, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0054, + "step": 18060 + }, + { + "epoch": 1.1059428361588837, + "grad_norm": 0.24979132413864136, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0045, + "step": 18070 + }, + { + "epoch": 1.1065548687190159, + "grad_norm": 0.17564314603805542, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0033, + "step": 18080 + }, + { + "epoch": 1.107166901279148, + "grad_norm": 0.14539776742458344, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0047, + "step": 18090 + }, + { + "epoch": 1.1077789338392803, + "grad_norm": 0.2530387341976166, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0058, + "step": 18100 + }, + { + "epoch": 1.1083909663994125, + "grad_norm": 0.2038760781288147, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0052, + "step": 18110 + }, + { + "epoch": 1.1090029989595447, + "grad_norm": 0.1769075244665146, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0043, + "step": 18120 + }, + { + "epoch": 1.1096150315196769, + "grad_norm": 0.1686626374721527, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0055, + "step": 18130 + }, + { + "epoch": 1.110227064079809, + "grad_norm": 0.21752336621284485, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0052, + "step": 18140 + }, + { + "epoch": 1.1108390966399413, + "grad_norm": 0.2739295959472656, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0039, + "step": 18150 + }, + { + "epoch": 1.1114511292000735, + "grad_norm": 0.18259567022323608, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0038, + "step": 18160 + }, + { + "epoch": 1.1120631617602057, + "grad_norm": 0.21565310657024384, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0043, + "step": 18170 + }, + { + "epoch": 1.112675194320338, + "grad_norm": 0.2141607403755188, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0059, + "step": 18180 + }, + { + "epoch": 1.11328722688047, + "grad_norm": 0.3017563819885254, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0044, + "step": 18190 + }, + { + "epoch": 1.1138992594406023, + "grad_norm": 0.2021455019712448, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0044, + "step": 18200 + }, + { + "epoch": 1.1145112920007345, + "grad_norm": 0.2113070785999298, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0048, + "step": 18210 + }, + { + "epoch": 1.1151233245608667, + "grad_norm": 0.18945784866809845, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0029, + "step": 18220 + }, + { + "epoch": 1.115735357120999, + "grad_norm": 0.15259192883968353, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0043, + "step": 18230 + }, + { + "epoch": 1.1163473896811311, + "grad_norm": 0.17555822432041168, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0039, + "step": 18240 + }, + { + "epoch": 1.1169594222412633, + "grad_norm": 0.20105648040771484, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0058, + "step": 18250 + }, + { + "epoch": 1.1175714548013955, + "grad_norm": 0.31626567244529724, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0044, + "step": 18260 + }, + { + "epoch": 1.1181834873615277, + "grad_norm": 0.16219007968902588, + "learning_rate": 8.340593854157868e-06, + "loss": 0.005, + "step": 18270 + }, + { + "epoch": 1.11879551992166, + "grad_norm": 0.2174186110496521, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0055, + "step": 18280 + }, + { + "epoch": 1.1194075524817921, + "grad_norm": 0.13639339804649353, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0043, + "step": 18290 + }, + { + "epoch": 1.1200195850419243, + "grad_norm": 0.15100249648094177, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0042, + "step": 18300 + }, + { + "epoch": 1.1206316176020565, + "grad_norm": 0.2114904671907425, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0043, + "step": 18310 + }, + { + "epoch": 1.1212436501621887, + "grad_norm": 0.2941966950893402, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0052, + "step": 18320 + }, + { + "epoch": 1.1218556827223207, + "grad_norm": 0.21695150434970856, + "learning_rate": 8.28476400245882e-06, + "loss": 0.005, + "step": 18330 + }, + { + "epoch": 1.122467715282453, + "grad_norm": 0.11768218129873276, + "learning_rate": 8.275470116190976e-06, + "loss": 0.005, + "step": 18340 + }, + { + "epoch": 1.1230797478425851, + "grad_norm": 0.1427483856678009, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0039, + "step": 18350 + }, + { + "epoch": 1.1236917804027173, + "grad_norm": 0.1837971955537796, + "learning_rate": 8.256891946721157e-06, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 1.1243038129628495, + "grad_norm": 0.30968883633613586, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0037, + "step": 18370 + }, + { + "epoch": 1.1249158455229817, + "grad_norm": 0.13366396725177765, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0042, + "step": 18380 + }, + { + "epoch": 1.125527878083114, + "grad_norm": 0.1829235553741455, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0039, + "step": 18390 + }, + { + "epoch": 1.1261399106432461, + "grad_norm": 0.3106991648674011, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0052, + "step": 18400 + }, + { + "epoch": 1.1267519432033783, + "grad_norm": 0.38655754923820496, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0046, + "step": 18410 + }, + { + "epoch": 1.1273639757635106, + "grad_norm": 0.23598383367061615, + "learning_rate": 8.201235047388747e-06, + "loss": 0.004, + "step": 18420 + }, + { + "epoch": 1.1279760083236428, + "grad_norm": 0.17428012192249298, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0046, + "step": 18430 + }, + { + "epoch": 1.128588040883775, + "grad_norm": 0.1847466081380844, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0043, + "step": 18440 + }, + { + "epoch": 1.1292000734439072, + "grad_norm": 0.14917762577533722, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0049, + "step": 18450 + }, + { + "epoch": 1.1298121060040394, + "grad_norm": 0.2882528305053711, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 1.1304241385641716, + "grad_norm": 0.36186549067497253, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0065, + "step": 18470 + }, + { + "epoch": 1.1310361711243038, + "grad_norm": 0.1604463905096054, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0037, + "step": 18480 + }, + { + "epoch": 1.131648203684436, + "grad_norm": 0.17751921713352203, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0034, + "step": 18490 + }, + { + "epoch": 1.1322602362445682, + "grad_norm": 0.15355733036994934, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0059, + "step": 18500 + }, + { + "epoch": 1.1328722688047004, + "grad_norm": 0.21558596193790436, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0044, + "step": 18510 + }, + { + "epoch": 1.1334843013648326, + "grad_norm": 0.20114412903785706, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1340963339249648, + "grad_norm": 0.17260855436325073, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0053, + "step": 18530 + }, + { + "epoch": 1.134708366485097, + "grad_norm": 0.16089287400245667, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0032, + "step": 18540 + }, + { + "epoch": 1.1353203990452292, + "grad_norm": 0.14655937254428864, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 1.1359324316053614, + "grad_norm": 0.16373249888420105, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0058, + "step": 18560 + }, + { + "epoch": 1.1365444641654936, + "grad_norm": 0.14543801546096802, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1371564967256258, + "grad_norm": 0.3515278100967407, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0043, + "step": 18580 + }, + { + "epoch": 1.137768529285758, + "grad_norm": 0.21776945888996124, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0053, + "step": 18590 + }, + { + "epoch": 1.1383805618458902, + "grad_norm": 0.21879829466342926, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0051, + "step": 18600 + }, + { + "epoch": 1.1389925944060224, + "grad_norm": 0.16967973113059998, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0048, + "step": 18610 + }, + { + "epoch": 1.1396046269661546, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0056, + "step": 18620 + }, + { + "epoch": 1.1402166595262868, + "grad_norm": 0.1858961284160614, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0067, + "step": 18630 + }, + { + "epoch": 1.140828692086419, + "grad_norm": 0.25853803753852844, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0057, + "step": 18640 + }, + { + "epoch": 1.1414407246465512, + "grad_norm": 0.18566234409809113, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0048, + "step": 18650 + }, + { + "epoch": 1.1420527572066834, + "grad_norm": 0.3471083343029022, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0042, + "step": 18660 + }, + { + "epoch": 1.1426647897668156, + "grad_norm": 0.2092636376619339, + "learning_rate": 7.970630670012853e-06, + "loss": 0.004, + "step": 18670 + }, + { + "epoch": 1.1432768223269478, + "grad_norm": 0.3432580828666687, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 1.14388885488708, + "grad_norm": 0.14227882027626038, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0041, + "step": 18690 + }, + { + "epoch": 1.1445008874472122, + "grad_norm": 0.2128007709980011, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0037, + "step": 18700 + }, + { + "epoch": 1.1451129200073444, + "grad_norm": 0.25377482175827026, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0049, + "step": 18710 + }, + { + "epoch": 1.1457249525674766, + "grad_norm": 0.1905982494354248, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0037, + "step": 18720 + }, + { + "epoch": 1.1463369851276088, + "grad_norm": 0.3090096712112427, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0079, + "step": 18730 + }, + { + "epoch": 1.146949017687741, + "grad_norm": 0.15604345500469208, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0037, + "step": 18740 + }, + { + "epoch": 1.1475610502478732, + "grad_norm": 0.21756386756896973, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0043, + "step": 18750 + }, + { + "epoch": 1.1481730828080055, + "grad_norm": 0.23869304358959198, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0038, + "step": 18760 + }, + { + "epoch": 1.1487851153681377, + "grad_norm": 0.18082380294799805, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 1.1493971479282699, + "grad_norm": 0.4032754898071289, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0061, + "step": 18780 + }, + { + "epoch": 1.150009180488402, + "grad_norm": 0.3173290491104126, + "learning_rate": 7.860719408056385e-06, + "loss": 0.004, + "step": 18790 + }, + { + "epoch": 1.1506212130485343, + "grad_norm": 0.18892645835876465, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.1512332456086665, + "grad_norm": 0.26740241050720215, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0056, + "step": 18810 + }, + { + "epoch": 1.1518452781687987, + "grad_norm": 0.3046218752861023, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0072, + "step": 18820 + }, + { + "epoch": 1.1524573107289309, + "grad_norm": 0.17181983590126038, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0034, + "step": 18830 + }, + { + "epoch": 1.1530693432890629, + "grad_norm": 0.22095724940299988, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0045, + "step": 18840 + }, + { + "epoch": 1.153681375849195, + "grad_norm": 0.1514609307050705, + "learning_rate": 7.80596155940873e-06, + "loss": 0.004, + "step": 18850 + }, + { + "epoch": 1.1542934084093273, + "grad_norm": 0.15244366228580475, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0047, + "step": 18860 + }, + { + "epoch": 1.1549054409694595, + "grad_norm": 0.24359947443008423, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0039, + "step": 18870 + }, + { + "epoch": 1.1555174735295917, + "grad_norm": 0.15558156371116638, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0036, + "step": 18880 + }, + { + "epoch": 1.1561295060897239, + "grad_norm": 0.33679234981536865, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0041, + "step": 18890 + }, + { + "epoch": 1.156741538649856, + "grad_norm": 0.15811999142169952, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0062, + "step": 18900 + }, + { + "epoch": 1.1573535712099883, + "grad_norm": 0.14838527143001556, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0029, + "step": 18910 + }, + { + "epoch": 1.1579656037701205, + "grad_norm": 0.23024815320968628, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0038, + "step": 18920 + }, + { + "epoch": 1.1585776363302527, + "grad_norm": 0.18455618619918823, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0044, + "step": 18930 + }, + { + "epoch": 1.1591896688903849, + "grad_norm": 0.20213079452514648, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.159801701450517, + "grad_norm": 0.19000643491744995, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0043, + "step": 18950 + }, + { + "epoch": 1.1604137340106493, + "grad_norm": 0.14075686037540436, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0047, + "step": 18960 + }, + { + "epoch": 1.1610257665707815, + "grad_norm": 0.22101792693138123, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0045, + "step": 18970 + }, + { + "epoch": 1.1616377991309137, + "grad_norm": 0.1097906231880188, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0037, + "step": 18980 + }, + { + "epoch": 1.162249831691046, + "grad_norm": 0.16169370710849762, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.162861864251178, + "grad_norm": 0.32931753993034363, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0052, + "step": 19000 + }, + { + "epoch": 1.1634738968113103, + "grad_norm": 0.2494741678237915, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0057, + "step": 19010 + }, + { + "epoch": 1.1640859293714425, + "grad_norm": 0.18492171168327332, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0056, + "step": 19020 + }, + { + "epoch": 1.1646979619315747, + "grad_norm": 0.18830963969230652, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0036, + "step": 19030 + }, + { + "epoch": 1.165309994491707, + "grad_norm": 0.1331586092710495, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0046, + "step": 19040 + }, + { + "epoch": 1.1659220270518391, + "grad_norm": 0.2433806210756302, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0053, + "step": 19050 + }, + { + "epoch": 1.1665340596119713, + "grad_norm": 0.24491485953330994, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 1.1671460921721035, + "grad_norm": 0.1789211630821228, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 1.1677581247322357, + "grad_norm": 0.2729121148586273, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0043, + "step": 19080 + }, + { + "epoch": 1.168370157292368, + "grad_norm": 0.19535189867019653, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0056, + "step": 19090 + }, + { + "epoch": 1.1689821898525001, + "grad_norm": 0.2282983660697937, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0048, + "step": 19100 + }, + { + "epoch": 1.1695942224126323, + "grad_norm": 0.1281195729970932, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0045, + "step": 19110 + }, + { + "epoch": 1.1702062549727645, + "grad_norm": 0.2850968539714813, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0034, + "step": 19120 + }, + { + "epoch": 1.1708182875328967, + "grad_norm": 0.12891536951065063, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.171430320093029, + "grad_norm": 0.13464727997779846, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0033, + "step": 19140 + }, + { + "epoch": 1.1720423526531611, + "grad_norm": 0.2415568083524704, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0041, + "step": 19150 + }, + { + "epoch": 1.1726543852132933, + "grad_norm": 0.15686331689357758, + "learning_rate": 7.525246655150879e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 1.1732664177734256, + "grad_norm": 0.15490666031837463, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0039, + "step": 19170 + }, + { + "epoch": 1.1738784503335578, + "grad_norm": 0.14095450937747955, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0034, + "step": 19180 + }, + { + "epoch": 1.17449048289369, + "grad_norm": 0.19024531543254852, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0084, + "step": 19190 + }, + { + "epoch": 1.1751025154538222, + "grad_norm": 0.2583692669868469, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 1.1757145480139544, + "grad_norm": 0.19117654860019684, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0038, + "step": 19210 + }, + { + "epoch": 1.1763265805740866, + "grad_norm": 0.15838374197483063, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1769386131342188, + "grad_norm": 0.30352044105529785, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0052, + "step": 19230 + }, + { + "epoch": 1.177550645694351, + "grad_norm": 0.229969322681427, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0043, + "step": 19240 + }, + { + "epoch": 1.1781626782544832, + "grad_norm": 0.17781461775302887, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0046, + "step": 19250 + }, + { + "epoch": 1.1787747108146154, + "grad_norm": 0.1306339055299759, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1793867433747476, + "grad_norm": 0.15727253258228302, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0045, + "step": 19270 + }, + { + "epoch": 1.1799987759348798, + "grad_norm": 0.24909166991710663, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0045, + "step": 19280 + }, + { + "epoch": 1.180610808495012, + "grad_norm": 0.4604126811027527, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0053, + "step": 19290 + }, + { + "epoch": 1.1812228410551442, + "grad_norm": 0.12739762663841248, + "learning_rate": 7.399737764864619e-06, + "loss": 0.004, + "step": 19300 + }, + { + "epoch": 1.1818348736152764, + "grad_norm": 0.2849223017692566, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0043, + "step": 19310 + }, + { + "epoch": 1.1824469061754086, + "grad_norm": 0.26089897751808167, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0044, + "step": 19320 + }, + { + "epoch": 1.1830589387355408, + "grad_norm": 0.1752242147922516, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.183670971295673, + "grad_norm": 0.14917130768299103, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0097, + "step": 19340 + }, + { + "epoch": 1.1842830038558052, + "grad_norm": 0.1599114090204239, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0061, + "step": 19350 + }, + { + "epoch": 1.1848950364159374, + "grad_norm": 0.16370004415512085, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 1.1855070689760696, + "grad_norm": 0.19354844093322754, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0032, + "step": 19370 + }, + { + "epoch": 1.1861191015362018, + "grad_norm": 0.19689561426639557, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0067, + "step": 19380 + }, + { + "epoch": 1.186731134096334, + "grad_norm": 0.22203278541564941, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0041, + "step": 19390 + }, + { + "epoch": 1.1873431666564662, + "grad_norm": 0.13579773902893066, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0048, + "step": 19400 + }, + { + "epoch": 1.1879551992165984, + "grad_norm": 0.12321218848228455, + "learning_rate": 7.301703138094429e-06, + "loss": 0.004, + "step": 19410 + }, + { + "epoch": 1.1885672317767306, + "grad_norm": 0.28819525241851807, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 1.1891792643368628, + "grad_norm": 0.2577916085720062, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0039, + "step": 19430 + }, + { + "epoch": 1.189791296896995, + "grad_norm": 0.26840633153915405, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0062, + "step": 19440 + }, + { + "epoch": 1.1904033294571272, + "grad_norm": 0.24222144484519958, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1910153620172594, + "grad_norm": 0.157009556889534, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0038, + "step": 19460 + }, + { + "epoch": 1.1916273945773916, + "grad_norm": 0.19925500452518463, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0042, + "step": 19470 + }, + { + "epoch": 1.1922394271375236, + "grad_norm": 0.19200846552848816, + "learning_rate": 7.239590017751423e-06, + "loss": 0.004, + "step": 19480 + }, + { + "epoch": 1.1928514596976558, + "grad_norm": 0.18441490828990936, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 1.193463492257788, + "grad_norm": 0.27565324306488037, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0046, + "step": 19500 + }, + { + "epoch": 1.1940755248179202, + "grad_norm": 0.17830556631088257, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0043, + "step": 19510 + }, + { + "epoch": 1.1946875573780524, + "grad_norm": 0.2769330143928528, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 1.1952995899381846, + "grad_norm": 0.168451189994812, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0039, + "step": 19530 + }, + { + "epoch": 1.1959116224983168, + "grad_norm": 0.31246763467788696, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0046, + "step": 19540 + }, + { + "epoch": 1.196523655058449, + "grad_norm": 0.21112671494483948, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0041, + "step": 19550 + }, + { + "epoch": 1.1971356876185812, + "grad_norm": 0.31681302189826965, + "learning_rate": 7.168868583990693e-06, + "loss": 0.005, + "step": 19560 + }, + { + "epoch": 1.1977477201787134, + "grad_norm": 0.18634411692619324, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0042, + "step": 19570 + }, + { + "epoch": 1.1983597527388457, + "grad_norm": 0.17780153453350067, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0057, + "step": 19580 + }, + { + "epoch": 1.1989717852989779, + "grad_norm": 0.19183002412319183, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0043, + "step": 19590 + }, + { + "epoch": 1.19958381785911, + "grad_norm": 0.28469574451446533, + "learning_rate": 7.133615440411572e-06, + "loss": 0.004, + "step": 19600 + }, + { + "epoch": 1.2001958504192423, + "grad_norm": 0.22470368444919586, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0044, + "step": 19610 + }, + { + "epoch": 1.2008078829793745, + "grad_norm": 0.23563240468502045, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0041, + "step": 19620 + }, + { + "epoch": 1.2014199155395067, + "grad_norm": 0.18467430770397186, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0053, + "step": 19630 + }, + { + "epoch": 1.2020319480996389, + "grad_norm": 0.12539178133010864, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0047, + "step": 19640 + }, + { + "epoch": 1.202643980659771, + "grad_norm": 0.2552005648612976, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.004, + "step": 19650 + }, + { + "epoch": 1.2032560132199033, + "grad_norm": 0.13963459432125092, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0035, + "step": 19660 + }, + { + "epoch": 1.2038680457800355, + "grad_norm": 0.17387327551841736, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0038, + "step": 19670 + }, + { + "epoch": 1.2044800783401677, + "grad_norm": 0.1284111589193344, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0044, + "step": 19680 + }, + { + "epoch": 1.2050921109002999, + "grad_norm": 0.22337380051612854, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 1.205704143460432, + "grad_norm": 0.2254808247089386, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0033, + "step": 19700 + }, + { + "epoch": 1.2063161760205643, + "grad_norm": 0.19316980242729187, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0041, + "step": 19710 + }, + { + "epoch": 1.2069282085806965, + "grad_norm": 0.17951075732707977, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0038, + "step": 19720 + }, + { + "epoch": 1.2075402411408287, + "grad_norm": 0.3105165660381317, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0043, + "step": 19730 + }, + { + "epoch": 1.208152273700961, + "grad_norm": 0.21083533763885498, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0039, + "step": 19740 + }, + { + "epoch": 1.208764306261093, + "grad_norm": 0.20121195912361145, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0035, + "step": 19750 + }, + { + "epoch": 1.2093763388212253, + "grad_norm": 0.20067447423934937, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0047, + "step": 19760 + }, + { + "epoch": 1.2099883713813575, + "grad_norm": 0.15943066775798798, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0039, + "step": 19770 + }, + { + "epoch": 1.2106004039414897, + "grad_norm": 0.21581032872200012, + "learning_rate": 6.975884226362e-06, + "loss": 0.0045, + "step": 19780 + }, + { + "epoch": 1.211212436501622, + "grad_norm": 0.16258753836154938, + "learning_rate": 6.967165692827958e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 1.2118244690617541, + "grad_norm": 0.18742400407791138, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0047, + "step": 19800 + }, + { + "epoch": 1.2124365016218863, + "grad_norm": 0.09035168588161469, + "learning_rate": 6.949742834253074e-06, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 1.2130485341820185, + "grad_norm": 0.21749694645404816, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0054, + "step": 19820 + }, + { + "epoch": 1.2136605667421507, + "grad_norm": 0.3189448416233063, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0043, + "step": 19830 + }, + { + "epoch": 1.214272599302283, + "grad_norm": 0.26815512776374817, + "learning_rate": 6.923644220932124e-06, + "loss": 0.005, + "step": 19840 + }, + { + "epoch": 1.2148846318624151, + "grad_norm": 0.19533704221248627, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0037, + "step": 19850 + }, + { + "epoch": 1.2154966644225473, + "grad_norm": 0.36249589920043945, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0064, + "step": 19860 + }, + { + "epoch": 1.2161086969826795, + "grad_norm": 0.19801265001296997, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0042, + "step": 19870 + }, + { + "epoch": 1.2167207295428117, + "grad_norm": 0.10341386497020721, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0053, + "step": 19880 + }, + { + "epoch": 1.217332762102944, + "grad_norm": 0.17985381186008453, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.2179447946630761, + "grad_norm": 0.18160982429981232, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 1.2185568272232083, + "grad_norm": 0.15552182495594025, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 1.2191688597833406, + "grad_norm": 0.34908807277679443, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0046, + "step": 19920 + }, + { + "epoch": 1.2197808923434728, + "grad_norm": 0.14835652709007263, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0042, + "step": 19930 + }, + { + "epoch": 1.220392924903605, + "grad_norm": 0.23276430368423462, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 1.2210049574637372, + "grad_norm": 0.1900823563337326, + "learning_rate": 6.828319751504063e-06, + "loss": 0.004, + "step": 19950 + }, + { + "epoch": 1.2216169900238694, + "grad_norm": 0.134046271443367, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0039, + "step": 19960 + }, + { + "epoch": 1.2222290225840013, + "grad_norm": 0.17264600098133087, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 1.2228410551441335, + "grad_norm": 0.24845834076404572, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0044, + "step": 19980 + }, + { + "epoch": 1.2234530877042658, + "grad_norm": 0.14805762469768524, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0049, + "step": 19990 + }, + { + "epoch": 1.224065120264398, + "grad_norm": 0.228907972574234, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0043, + "step": 20000 + }, + { + "epoch": 1.2246771528245302, + "grad_norm": 0.16869507730007172, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0041, + "step": 20010 + }, + { + "epoch": 1.2252891853846624, + "grad_norm": 0.1983603835105896, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0041, + "step": 20020 + }, + { + "epoch": 1.2259012179447946, + "grad_norm": 0.17656362056732178, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0028, + "step": 20030 + }, + { + "epoch": 1.2265132505049268, + "grad_norm": 0.1360313892364502, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0069, + "step": 20040 + }, + { + "epoch": 1.227125283065059, + "grad_norm": 0.21057721972465515, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2277373156251912, + "grad_norm": 0.138632670044899, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0038, + "step": 20060 + }, + { + "epoch": 1.2283493481853234, + "grad_norm": 0.17815573513507843, + "learning_rate": 6.725005485342219e-06, + "loss": 0.003, + "step": 20070 + }, + { + "epoch": 1.2289613807454556, + "grad_norm": 0.1769353598356247, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0066, + "step": 20080 + }, + { + "epoch": 1.2295734133055878, + "grad_norm": 0.23068928718566895, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0048, + "step": 20090 + }, + { + "epoch": 1.23018544586572, + "grad_norm": 0.25139328837394714, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0049, + "step": 20100 + }, + { + "epoch": 1.2307974784258522, + "grad_norm": 0.09128634631633759, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0042, + "step": 20110 + }, + { + "epoch": 1.2314095109859844, + "grad_norm": 0.20516613125801086, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0031, + "step": 20120 + }, + { + "epoch": 1.2320215435461166, + "grad_norm": 0.1518358588218689, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0049, + "step": 20130 + }, + { + "epoch": 1.2326335761062488, + "grad_norm": 0.1673758625984192, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0044, + "step": 20140 + }, + { + "epoch": 1.233245608666381, + "grad_norm": 0.14084585011005402, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0053, + "step": 20150 + }, + { + "epoch": 1.2338576412265132, + "grad_norm": 0.23316942155361176, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0042, + "step": 20160 + }, + { + "epoch": 1.2344696737866454, + "grad_norm": 0.23793813586235046, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0045, + "step": 20170 + }, + { + "epoch": 1.2350817063467776, + "grad_norm": 0.4269389510154724, + "learning_rate": 6.630934952049143e-06, + "loss": 0.005, + "step": 20180 + }, + { + "epoch": 1.2356937389069098, + "grad_norm": 0.15654191374778748, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0039, + "step": 20190 + }, + { + "epoch": 1.236305771467042, + "grad_norm": 0.19204623997211456, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0032, + "step": 20200 + }, + { + "epoch": 1.2369178040271742, + "grad_norm": 0.15817691385746002, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0044, + "step": 20210 + }, + { + "epoch": 1.2375298365873064, + "grad_norm": 0.12637947499752045, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2381418691474386, + "grad_norm": 0.26657921075820923, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0041, + "step": 20230 + }, + { + "epoch": 1.2387539017075708, + "grad_norm": 0.15207791328430176, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0045, + "step": 20240 + }, + { + "epoch": 1.239365934267703, + "grad_norm": 0.32583367824554443, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 1.2399779668278352, + "grad_norm": 0.15617726743221283, + "learning_rate": 6.562908932779455e-06, + "loss": 0.004, + "step": 20260 + }, + { + "epoch": 1.2405899993879674, + "grad_norm": 0.1935809850692749, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2412020319480996, + "grad_norm": 0.17422369122505188, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0035, + "step": 20280 + }, + { + "epoch": 1.2418140645082318, + "grad_norm": 0.15332955121994019, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0049, + "step": 20290 + }, + { + "epoch": 1.242426097068364, + "grad_norm": 0.16183018684387207, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0042, + "step": 20300 + }, + { + "epoch": 1.2430381296284962, + "grad_norm": 0.28421106934547424, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0045, + "step": 20310 + }, + { + "epoch": 1.2436501621886284, + "grad_norm": 0.23288874328136444, + "learning_rate": 6.512107839793337e-06, + "loss": 0.004, + "step": 20320 + }, + { + "epoch": 1.2442621947487607, + "grad_norm": 0.17955242097377777, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 1.2448742273088929, + "grad_norm": 0.20192117989063263, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 1.245486259869025, + "grad_norm": 0.15365810692310333, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 1.2460982924291573, + "grad_norm": 0.25220832228660583, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0039, + "step": 20360 + }, + { + "epoch": 1.2467103249892895, + "grad_norm": 0.25777462124824524, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0053, + "step": 20370 + }, + { + "epoch": 1.2473223575494217, + "grad_norm": 0.2693277895450592, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0053, + "step": 20380 + }, + { + "epoch": 1.2479343901095539, + "grad_norm": 0.22846420109272003, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0033, + "step": 20390 + }, + { + "epoch": 1.248546422669686, + "grad_norm": 0.17022505402565002, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0046, + "step": 20400 + }, + { + "epoch": 1.2491584552298183, + "grad_norm": 0.08295682072639465, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0035, + "step": 20410 + }, + { + "epoch": 1.2497704877899505, + "grad_norm": 0.2745625972747803, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0044, + "step": 20420 + }, + { + "epoch": 1.2503825203500827, + "grad_norm": 0.12855033576488495, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2509945529102149, + "grad_norm": 0.30358386039733887, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0049, + "step": 20440 + }, + { + "epoch": 1.251606585470347, + "grad_norm": 0.15514959394931793, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0048, + "step": 20450 + }, + { + "epoch": 1.2522186180304793, + "grad_norm": 0.1414988487958908, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0088, + "step": 20460 + }, + { + "epoch": 1.2528306505906115, + "grad_norm": 0.17399665713310242, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0046, + "step": 20470 + }, + { + "epoch": 1.2534426831507437, + "grad_norm": 0.22629426419734955, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.254054715710876, + "grad_norm": 0.30595293641090393, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0027, + "step": 20490 + }, + { + "epoch": 1.254666748271008, + "grad_norm": 0.17980262637138367, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2552787808311403, + "grad_norm": 0.19016452133655548, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0029, + "step": 20510 + }, + { + "epoch": 1.2558908133912725, + "grad_norm": 0.20200394093990326, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0037, + "step": 20520 + }, + { + "epoch": 1.2565028459514047, + "grad_norm": 0.15347513556480408, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0041, + "step": 20530 + }, + { + "epoch": 1.257114878511537, + "grad_norm": 0.1851687729358673, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0042, + "step": 20540 + }, + { + "epoch": 1.2577269110716691, + "grad_norm": 0.2529662549495697, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0037, + "step": 20550 + }, + { + "epoch": 1.2583389436318013, + "grad_norm": 0.18209592998027802, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 1.2589509761919335, + "grad_norm": 0.18981963396072388, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0036, + "step": 20570 + }, + { + "epoch": 1.2595630087520657, + "grad_norm": 0.13232728838920593, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0035, + "step": 20580 + }, + { + "epoch": 1.260175041312198, + "grad_norm": 0.133514404296875, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0039, + "step": 20590 + }, + { + "epoch": 1.2607870738723301, + "grad_norm": 0.14339123666286469, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0043, + "step": 20600 + }, + { + "epoch": 1.2613991064324623, + "grad_norm": 0.48857489228248596, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0045, + "step": 20610 + }, + { + "epoch": 1.2620111389925945, + "grad_norm": 0.1513262242078781, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0029, + "step": 20620 + }, + { + "epoch": 1.2626231715527267, + "grad_norm": 0.1497354805469513, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0039, + "step": 20630 + }, + { + "epoch": 1.2632352041128587, + "grad_norm": 0.132791206240654, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0037, + "step": 20640 + }, + { + "epoch": 1.263847236672991, + "grad_norm": 0.13804496824741364, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0035, + "step": 20650 + }, + { + "epoch": 1.2644592692331231, + "grad_norm": 0.19393391907215118, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0049, + "step": 20660 + }, + { + "epoch": 1.2650713017932553, + "grad_norm": 0.17623338103294373, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0038, + "step": 20670 + }, + { + "epoch": 1.2656833343533875, + "grad_norm": 0.26931124925613403, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0042, + "step": 20680 + }, + { + "epoch": 1.2662953669135197, + "grad_norm": 0.17984439432621002, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0036, + "step": 20690 + }, + { + "epoch": 1.266907399473652, + "grad_norm": 0.19648219645023346, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0046, + "step": 20700 + }, + { + "epoch": 1.2675194320337841, + "grad_norm": 0.1464766263961792, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0024, + "step": 20710 + }, + { + "epoch": 1.2681314645939163, + "grad_norm": 0.1271074265241623, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0042, + "step": 20720 + }, + { + "epoch": 1.2687434971540485, + "grad_norm": 0.15960967540740967, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0079, + "step": 20730 + }, + { + "epoch": 1.2693555297141808, + "grad_norm": 0.13636153936386108, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0046, + "step": 20740 + }, + { + "epoch": 1.269967562274313, + "grad_norm": 0.19099050760269165, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0046, + "step": 20750 + }, + { + "epoch": 1.2705795948344452, + "grad_norm": 0.28632739186286926, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0036, + "step": 20760 + }, + { + "epoch": 1.2711916273945774, + "grad_norm": 0.2565019726753235, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0055, + "step": 20770 + }, + { + "epoch": 1.2718036599547096, + "grad_norm": 0.24443399906158447, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2724156925148418, + "grad_norm": 0.1396762877702713, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0029, + "step": 20790 + }, + { + "epoch": 1.273027725074974, + "grad_norm": 0.3028377890586853, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0061, + "step": 20800 + }, + { + "epoch": 1.2736397576351062, + "grad_norm": 0.18195804953575134, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0034, + "step": 20810 + }, + { + "epoch": 1.2742517901952384, + "grad_norm": 0.16194652020931244, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0054, + "step": 20820 + }, + { + "epoch": 1.2748638227553706, + "grad_norm": 0.13011956214904785, + "learning_rate": 6.08816828695283e-06, + "loss": 0.003, + "step": 20830 + }, + { + "epoch": 1.2754758553155028, + "grad_norm": 0.23294220864772797, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0041, + "step": 20840 + }, + { + "epoch": 1.276087887875635, + "grad_norm": 0.1892961710691452, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0031, + "step": 20850 + }, + { + "epoch": 1.2766999204357672, + "grad_norm": 0.1984476000070572, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0046, + "step": 20860 + }, + { + "epoch": 1.2773119529958994, + "grad_norm": 0.158709317445755, + "learning_rate": 6.055535530104466e-06, + "loss": 0.003, + "step": 20870 + }, + { + "epoch": 1.2779239855560316, + "grad_norm": 0.16505110263824463, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0039, + "step": 20880 + }, + { + "epoch": 1.2785360181161638, + "grad_norm": 0.18332232534885406, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0036, + "step": 20890 + }, + { + "epoch": 1.279148050676296, + "grad_norm": 0.1797804981470108, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0049, + "step": 20900 + }, + { + "epoch": 1.2797600832364282, + "grad_norm": 0.19247964024543762, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0053, + "step": 20910 + }, + { + "epoch": 1.2803721157965604, + "grad_norm": 0.17845408618450165, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0045, + "step": 20920 + }, + { + "epoch": 1.2809841483566926, + "grad_norm": 0.09454555809497833, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0027, + "step": 20930 + }, + { + "epoch": 1.2815961809168248, + "grad_norm": 0.12647129595279694, + "learning_rate": 5.998651973182953e-06, + "loss": 0.004, + "step": 20940 + }, + { + "epoch": 1.282208213476957, + "grad_norm": 0.39115941524505615, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0051, + "step": 20950 + }, + { + "epoch": 1.2828202460370892, + "grad_norm": 0.29081296920776367, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 1.2834322785972214, + "grad_norm": 0.1849275827407837, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0042, + "step": 20970 + }, + { + "epoch": 1.2840443111573536, + "grad_norm": 0.24075689911842346, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0031, + "step": 20980 + }, + { + "epoch": 1.2846563437174858, + "grad_norm": 0.12463482469320297, + "learning_rate": 5.958196751005967e-06, + "loss": 0.003, + "step": 20990 + }, + { + "epoch": 1.285268376277618, + "grad_norm": 0.16987742483615875, + "learning_rate": 5.950123419134817e-06, + "loss": 0.004, + "step": 21000 + }, + { + "epoch": 1.2858804088377502, + "grad_norm": 0.20316782593727112, + "learning_rate": 5.942056013575106e-06, + "loss": 0.004, + "step": 21010 + }, + { + "epoch": 1.2864924413978824, + "grad_norm": 0.20989514887332916, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0053, + "step": 21020 + }, + { + "epoch": 1.2871044739580146, + "grad_norm": 0.33795273303985596, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0048, + "step": 21030 + }, + { + "epoch": 1.2877165065181468, + "grad_norm": 0.13918501138687134, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.004, + "step": 21040 + }, + { + "epoch": 1.288328539078279, + "grad_norm": 0.2992899715900421, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0038, + "step": 21050 + }, + { + "epoch": 1.288940571638411, + "grad_norm": 0.2540164589881897, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 1.2895526041985432, + "grad_norm": 0.161032035946846, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0047, + "step": 21070 + }, + { + "epoch": 1.2901646367586754, + "grad_norm": 0.1743200421333313, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0037, + "step": 21080 + }, + { + "epoch": 1.2907766693188076, + "grad_norm": 0.26604363322257996, + "learning_rate": 5.877731250949785e-06, + "loss": 0.004, + "step": 21090 + }, + { + "epoch": 1.2913887018789398, + "grad_norm": 0.275696724653244, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0044, + "step": 21100 + }, + { + "epoch": 1.292000734439072, + "grad_norm": 0.16888457536697388, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0042, + "step": 21110 + }, + { + "epoch": 1.2926127669992042, + "grad_norm": 0.12902231514453888, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0048, + "step": 21120 + }, + { + "epoch": 1.2932247995593364, + "grad_norm": 0.14577728509902954, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0046, + "step": 21130 + }, + { + "epoch": 1.2938368321194686, + "grad_norm": 0.1544434279203415, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0031, + "step": 21140 + }, + { + "epoch": 1.2944488646796009, + "grad_norm": 0.09238115698099136, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0035, + "step": 21150 + }, + { + "epoch": 1.295060897239733, + "grad_norm": 0.1770051270723343, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0033, + "step": 21160 + }, + { + "epoch": 1.2956729297998653, + "grad_norm": 0.20360831916332245, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0037, + "step": 21170 + }, + { + "epoch": 1.2962849623599975, + "grad_norm": 0.18503794074058533, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0045, + "step": 21180 + }, + { + "epoch": 1.2968969949201297, + "grad_norm": 0.12918968498706818, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0048, + "step": 21190 + }, + { + "epoch": 1.2975090274802619, + "grad_norm": 0.14289438724517822, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0041, + "step": 21200 + }, + { + "epoch": 1.298121060040394, + "grad_norm": 0.17546117305755615, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0042, + "step": 21210 + }, + { + "epoch": 1.2987330926005263, + "grad_norm": 0.2919277846813202, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0051, + "step": 21220 + }, + { + "epoch": 1.2993451251606585, + "grad_norm": 0.0988069474697113, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0044, + "step": 21230 + }, + { + "epoch": 1.2999571577207907, + "grad_norm": 0.19284513592720032, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0037, + "step": 21240 + }, + { + "epoch": 1.3005691902809229, + "grad_norm": 0.12894058227539062, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0031, + "step": 21250 + }, + { + "epoch": 1.301181222841055, + "grad_norm": 0.14740346372127533, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.3017932554011873, + "grad_norm": 0.16817794740200043, + "learning_rate": 5.734414476316747e-06, + "loss": 0.005, + "step": 21270 + }, + { + "epoch": 1.3024052879613195, + "grad_norm": 0.29237234592437744, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0039, + "step": 21280 + }, + { + "epoch": 1.3030173205214517, + "grad_norm": 0.12649856507778168, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.303629353081584, + "grad_norm": 0.11057443916797638, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0039, + "step": 21300 + }, + { + "epoch": 1.304241385641716, + "grad_norm": 0.13494674861431122, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.3048534182018483, + "grad_norm": 0.3079472482204437, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0042, + "step": 21320 + }, + { + "epoch": 1.3054654507619805, + "grad_norm": 0.13513535261154175, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.3060774833221127, + "grad_norm": 0.39266663789749146, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0046, + "step": 21340 + }, + { + "epoch": 1.306689515882245, + "grad_norm": 0.15097978711128235, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.3073015484423771, + "grad_norm": 0.25206202268600464, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0049, + "step": 21360 + }, + { + "epoch": 1.3079135810025093, + "grad_norm": 0.16765817999839783, + "learning_rate": 5.655655685355026e-06, + "loss": 0.005, + "step": 21370 + }, + { + "epoch": 1.3085256135626415, + "grad_norm": 0.2137158215045929, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0048, + "step": 21380 + }, + { + "epoch": 1.3091376461227737, + "grad_norm": 0.19711454212665558, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0043, + "step": 21390 + }, + { + "epoch": 1.309749678682906, + "grad_norm": 0.1722051054239273, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0044, + "step": 21400 + }, + { + "epoch": 1.3103617112430381, + "grad_norm": 0.1807536482810974, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 1.3109737438031703, + "grad_norm": 0.15052185952663422, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.004, + "step": 21420 + }, + { + "epoch": 1.3115857763633025, + "grad_norm": 0.1485220491886139, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.3121978089234347, + "grad_norm": 0.15065325796604156, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0037, + "step": 21440 + }, + { + "epoch": 1.312809841483567, + "grad_norm": 0.17903591692447662, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0047, + "step": 21450 + }, + { + "epoch": 1.3134218740436991, + "grad_norm": 0.14310622215270996, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0043, + "step": 21460 + }, + { + "epoch": 1.3140339066038313, + "grad_norm": 0.12117830663919449, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0053, + "step": 21470 + }, + { + "epoch": 1.3146459391639636, + "grad_norm": 0.1484573632478714, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 1.3152579717240958, + "grad_norm": 0.16559219360351562, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0037, + "step": 21490 + }, + { + "epoch": 1.315870004284228, + "grad_norm": 0.21626432240009308, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0031, + "step": 21500 + }, + { + "epoch": 1.3164820368443602, + "grad_norm": 0.08177383989095688, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0052, + "step": 21510 + }, + { + "epoch": 1.3170940694044924, + "grad_norm": 0.18640732765197754, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0062, + "step": 21520 + }, + { + "epoch": 1.3177061019646246, + "grad_norm": 0.2599853277206421, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0039, + "step": 21530 + }, + { + "epoch": 1.3183181345247568, + "grad_norm": 0.1591203212738037, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0034, + "step": 21540 + }, + { + "epoch": 1.318930167084889, + "grad_norm": 0.2834412455558777, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0037, + "step": 21550 + }, + { + "epoch": 1.3195421996450212, + "grad_norm": 0.13853803277015686, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0038, + "step": 21560 + }, + { + "epoch": 1.3201542322051534, + "grad_norm": 0.14707128703594208, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0042, + "step": 21570 + }, + { + "epoch": 1.3207662647652856, + "grad_norm": 0.12561920285224915, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0038, + "step": 21580 + }, + { + "epoch": 1.3213782973254178, + "grad_norm": 0.4156799018383026, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0051, + "step": 21590 + }, + { + "epoch": 1.32199032988555, + "grad_norm": 0.11400662362575531, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0031, + "step": 21600 + }, + { + "epoch": 1.3226023624456822, + "grad_norm": 0.15658807754516602, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0052, + "step": 21610 + }, + { + "epoch": 1.3232143950058144, + "grad_norm": 0.1212862953543663, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0034, + "step": 21620 + }, + { + "epoch": 1.3238264275659466, + "grad_norm": 0.2201654314994812, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0036, + "step": 21630 + }, + { + "epoch": 1.3244384601260788, + "grad_norm": 0.11623375117778778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0032, + "step": 21640 + }, + { + "epoch": 1.325050492686211, + "grad_norm": 0.13092897832393646, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0035, + "step": 21650 + }, + { + "epoch": 1.3256625252463432, + "grad_norm": 0.15409153699874878, + "learning_rate": 5.430834687545416e-06, + "loss": 0.004, + "step": 21660 + }, + { + "epoch": 1.3262745578064754, + "grad_norm": 0.3148297369480133, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0031, + "step": 21670 + }, + { + "epoch": 1.3268865903666076, + "grad_norm": 0.13435055315494537, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0033, + "step": 21680 + }, + { + "epoch": 1.3274986229267398, + "grad_norm": 0.17878089845180511, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0029, + "step": 21690 + }, + { + "epoch": 1.328110655486872, + "grad_norm": 0.1823783665895462, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0039, + "step": 21700 + }, + { + "epoch": 1.3287226880470042, + "grad_norm": 0.14492660760879517, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0033, + "step": 21710 + }, + { + "epoch": 1.3293347206071364, + "grad_norm": 0.1730341762304306, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0041, + "step": 21720 + }, + { + "epoch": 1.3299467531672686, + "grad_norm": 0.07961586117744446, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3305587857274008, + "grad_norm": 0.14440582692623138, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0038, + "step": 21740 + }, + { + "epoch": 1.331170818287533, + "grad_norm": 0.22034496068954468, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0023, + "step": 21750 + }, + { + "epoch": 1.3317828508476652, + "grad_norm": 0.1861305832862854, + "learning_rate": 5.354573491223212e-06, + "loss": 0.005, + "step": 21760 + }, + { + "epoch": 1.3323948834077972, + "grad_norm": 0.15587164461612701, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0044, + "step": 21770 + }, + { + "epoch": 1.3330069159679294, + "grad_norm": 0.6852900981903076, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0075, + "step": 21780 + }, + { + "epoch": 1.3336189485280616, + "grad_norm": 0.14315280318260193, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0034, + "step": 21790 + }, + { + "epoch": 1.3342309810881938, + "grad_norm": 0.350981205701828, + "learning_rate": 5.324254018551227e-06, + "loss": 0.004, + "step": 21800 + }, + { + "epoch": 1.334843013648326, + "grad_norm": 0.12344911694526672, + "learning_rate": 5.316690780174352e-06, + "loss": 0.004, + "step": 21810 + }, + { + "epoch": 1.3354550462084582, + "grad_norm": 0.18744061887264252, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3360670787685904, + "grad_norm": 0.22747837007045746, + "learning_rate": 5.301584321328435e-06, + "loss": 0.004, + "step": 21830 + }, + { + "epoch": 1.3366791113287226, + "grad_norm": 0.22695699334144592, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0042, + "step": 21840 + }, + { + "epoch": 1.3372911438888548, + "grad_norm": 0.17258964478969574, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0044, + "step": 21850 + }, + { + "epoch": 1.337903176448987, + "grad_norm": 0.1523793637752533, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0047, + "step": 21860 + }, + { + "epoch": 1.3385152090091192, + "grad_norm": 0.1983587145805359, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0037, + "step": 21870 + }, + { + "epoch": 1.3391272415692514, + "grad_norm": 0.1263747215270996, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0034, + "step": 21880 + }, + { + "epoch": 1.3397392741293837, + "grad_norm": 0.1550009399652481, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3403513066895159, + "grad_norm": 0.14963915944099426, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.340963339249648, + "grad_norm": 0.17783671617507935, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0036, + "step": 21910 + }, + { + "epoch": 1.3415753718097803, + "grad_norm": 0.2715896964073181, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3421874043699125, + "grad_norm": 0.22924886643886566, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0037, + "step": 21930 + }, + { + "epoch": 1.3427994369300447, + "grad_norm": 0.13689789175987244, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0033, + "step": 21940 + }, + { + "epoch": 1.3434114694901769, + "grad_norm": 0.09137748926877975, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0031, + "step": 21950 + }, + { + "epoch": 1.344023502050309, + "grad_norm": 0.17097881436347961, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0031, + "step": 21960 + }, + { + "epoch": 1.3446355346104413, + "grad_norm": 0.23919200897216797, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0046, + "step": 21970 + }, + { + "epoch": 1.3452475671705735, + "grad_norm": 0.14261527359485626, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0037, + "step": 21980 + }, + { + "epoch": 1.3458595997307057, + "grad_norm": 0.156734898686409, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.3464716322908379, + "grad_norm": 0.21755588054656982, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0032, + "step": 22000 + }, + { + "epoch": 1.34708366485097, + "grad_norm": 0.1373317390680313, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0033, + "step": 22010 + }, + { + "epoch": 1.3476956974111023, + "grad_norm": 0.1646856814622879, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0047, + "step": 22020 + }, + { + "epoch": 1.3483077299712345, + "grad_norm": 0.1908850073814392, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0044, + "step": 22030 + }, + { + "epoch": 1.3489197625313667, + "grad_norm": 0.24862833321094513, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0041, + "step": 22040 + }, + { + "epoch": 1.349531795091499, + "grad_norm": 0.15980397164821625, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0033, + "step": 22050 + }, + { + "epoch": 1.350143827651631, + "grad_norm": 0.1157977357506752, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0036, + "step": 22060 + }, + { + "epoch": 1.3507558602117633, + "grad_norm": 0.11186888068914413, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0046, + "step": 22070 + }, + { + "epoch": 1.3513678927718955, + "grad_norm": 0.17715996503829956, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0035, + "step": 22080 + }, + { + "epoch": 1.3519799253320277, + "grad_norm": 0.1265174001455307, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0048, + "step": 22090 + }, + { + "epoch": 1.35259195789216, + "grad_norm": 0.13969522714614868, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0028, + "step": 22100 + }, + { + "epoch": 1.3532039904522921, + "grad_norm": 0.13246525824069977, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0026, + "step": 22110 + }, + { + "epoch": 1.3538160230124243, + "grad_norm": 0.14675064384937286, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0082, + "step": 22120 + }, + { + "epoch": 1.3544280555725565, + "grad_norm": 0.15810683369636536, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0031, + "step": 22130 + }, + { + "epoch": 1.3550400881326887, + "grad_norm": 0.20675864815711975, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0035, + "step": 22140 + }, + { + "epoch": 1.355652120692821, + "grad_norm": 0.1921442300081253, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0038, + "step": 22150 + }, + { + "epoch": 1.3562641532529531, + "grad_norm": 0.14300711452960968, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0035, + "step": 22160 + }, + { + "epoch": 1.3568761858130853, + "grad_norm": 0.0656728520989418, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0047, + "step": 22170 + }, + { + "epoch": 1.3574882183732175, + "grad_norm": 0.148203507065773, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0041, + "step": 22180 + }, + { + "epoch": 1.3581002509333495, + "grad_norm": 0.15472126007080078, + "learning_rate": 5.034310349217475e-06, + "loss": 0.004, + "step": 22190 + }, + { + "epoch": 1.3587122834934817, + "grad_norm": 0.12006669491529465, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0034, + "step": 22200 + }, + { + "epoch": 1.359324316053614, + "grad_norm": 0.15345145761966705, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0032, + "step": 22210 + }, + { + "epoch": 1.3599363486137461, + "grad_norm": 0.17429186403751373, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0039, + "step": 22220 + }, + { + "epoch": 1.3605483811738783, + "grad_norm": 0.20691345632076263, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0029, + "step": 22230 + }, + { + "epoch": 1.3611604137340105, + "grad_norm": 0.1874946504831314, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0042, + "step": 22240 + }, + { + "epoch": 1.3617724462941427, + "grad_norm": 0.12159912288188934, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0033, + "step": 22250 + }, + { + "epoch": 1.362384478854275, + "grad_norm": 0.29434919357299805, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0044, + "step": 22260 + }, + { + "epoch": 1.3629965114144071, + "grad_norm": 0.06661798804998398, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0031, + "step": 22270 + }, + { + "epoch": 1.3636085439745393, + "grad_norm": 0.14819994568824768, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0039, + "step": 22280 + }, + { + "epoch": 1.3642205765346715, + "grad_norm": 0.17289887368679047, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0035, + "step": 22290 + }, + { + "epoch": 1.3648326090948038, + "grad_norm": 0.18789313733577728, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0036, + "step": 22300 + }, + { + "epoch": 1.365444641654936, + "grad_norm": 0.1877586394548416, + "learning_rate": 4.947215397583639e-06, + "loss": 0.004, + "step": 22310 + }, + { + "epoch": 1.3660566742150682, + "grad_norm": 0.11696574836969376, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0036, + "step": 22320 + }, + { + "epoch": 1.3666687067752004, + "grad_norm": 0.2511763274669647, + "learning_rate": 4.932798621873274e-06, + "loss": 0.004, + "step": 22330 + }, + { + "epoch": 1.3672807393353326, + "grad_norm": 0.15005314350128174, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0046, + "step": 22340 + }, + { + "epoch": 1.3678927718954648, + "grad_norm": 0.16856855154037476, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0056, + "step": 22350 + }, + { + "epoch": 1.368504804455597, + "grad_norm": 0.24532385170459747, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0035, + "step": 22360 + }, + { + "epoch": 1.3691168370157292, + "grad_norm": 0.29320162534713745, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0047, + "step": 22370 + }, + { + "epoch": 1.3697288695758614, + "grad_norm": 0.1518300473690033, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0041, + "step": 22380 + }, + { + "epoch": 1.3703409021359936, + "grad_norm": 0.13431201875209808, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0033, + "step": 22390 + }, + { + "epoch": 1.3709529346961258, + "grad_norm": 0.17390409111976624, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0039, + "step": 22400 + }, + { + "epoch": 1.371564967256258, + "grad_norm": 0.16482478380203247, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.007, + "step": 22410 + }, + { + "epoch": 1.3721769998163902, + "grad_norm": 0.11469490826129913, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0041, + "step": 22420 + }, + { + "epoch": 1.3727890323765224, + "grad_norm": 0.2327135056257248, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0043, + "step": 22430 + }, + { + "epoch": 1.3734010649366546, + "grad_norm": 0.1373092532157898, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0036, + "step": 22440 + }, + { + "epoch": 1.3740130974967868, + "grad_norm": 0.1534084528684616, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0028, + "step": 22450 + }, + { + "epoch": 1.374625130056919, + "grad_norm": 0.3217960596084595, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0044, + "step": 22460 + }, + { + "epoch": 1.3752371626170512, + "grad_norm": 0.14245563745498657, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0039, + "step": 22470 + }, + { + "epoch": 1.3758491951771834, + "grad_norm": 0.17652876675128937, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0031, + "step": 22480 + }, + { + "epoch": 1.3764612277373156, + "grad_norm": 0.1996244192123413, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0034, + "step": 22490 + }, + { + "epoch": 1.3770732602974478, + "grad_norm": 0.1658472716808319, + "learning_rate": 4.81141273556404e-06, + "loss": 0.003, + "step": 22500 + }, + { + "epoch": 1.37768529285758, + "grad_norm": 0.16233472526073456, + "learning_rate": 4.804337352679613e-06, + "loss": 0.004, + "step": 22510 + }, + { + "epoch": 1.3782973254177122, + "grad_norm": 0.13045033812522888, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0049, + "step": 22520 + }, + { + "epoch": 1.3789093579778444, + "grad_norm": 0.1195274218916893, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0042, + "step": 22530 + }, + { + "epoch": 1.3795213905379766, + "grad_norm": 0.14395804703235626, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0036, + "step": 22540 + }, + { + "epoch": 1.3801334230981088, + "grad_norm": 0.24495497345924377, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0047, + "step": 22550 + }, + { + "epoch": 1.380745455658241, + "grad_norm": 0.14288006722927094, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0044, + "step": 22560 + }, + { + "epoch": 1.3813574882183732, + "grad_norm": 0.16967979073524475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0051, + "step": 22570 + }, + { + "epoch": 1.3819695207785054, + "grad_norm": 0.2023036777973175, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0032, + "step": 22580 + }, + { + "epoch": 1.3825815533386376, + "grad_norm": 0.1191902756690979, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0026, + "step": 22590 + }, + { + "epoch": 1.3831935858987698, + "grad_norm": 0.16922403872013092, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0037, + "step": 22600 + }, + { + "epoch": 1.383805618458902, + "grad_norm": 0.12394976615905762, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0038, + "step": 22610 + }, + { + "epoch": 1.3844176510190342, + "grad_norm": 0.23889753222465515, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0041, + "step": 22620 + }, + { + "epoch": 1.3850296835791664, + "grad_norm": 0.31215062737464905, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0036, + "step": 22630 + }, + { + "epoch": 1.3856417161392987, + "grad_norm": 0.1519152820110321, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0042, + "step": 22640 + }, + { + "epoch": 1.3862537486994309, + "grad_norm": 0.3375433683395386, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0039, + "step": 22650 + }, + { + "epoch": 1.386865781259563, + "grad_norm": 0.21715323626995087, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0027, + "step": 22660 + }, + { + "epoch": 1.3874778138196953, + "grad_norm": 0.2066027969121933, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0033, + "step": 22670 + }, + { + "epoch": 1.3880898463798275, + "grad_norm": 0.11542408168315887, + "learning_rate": 4.6851750421442e-06, + "loss": 0.004, + "step": 22680 + }, + { + "epoch": 1.3887018789399597, + "grad_norm": 0.1183561235666275, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0037, + "step": 22690 + }, + { + "epoch": 1.3893139115000919, + "grad_norm": 0.24478662014007568, + "learning_rate": 4.67129597392514e-06, + "loss": 0.004, + "step": 22700 + }, + { + "epoch": 1.389925944060224, + "grad_norm": 0.28880801796913147, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0039, + "step": 22710 + }, + { + "epoch": 1.3905379766203563, + "grad_norm": 0.14014701545238495, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0034, + "step": 22720 + }, + { + "epoch": 1.3911500091804885, + "grad_norm": 0.1549793928861618, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0033, + "step": 22730 + }, + { + "epoch": 1.3917620417406207, + "grad_norm": 0.1423012614250183, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.3923740743007529, + "grad_norm": 0.291273832321167, + "learning_rate": 4.636728419531758e-06, + "loss": 0.004, + "step": 22750 + }, + { + "epoch": 1.392986106860885, + "grad_norm": 0.38278621435165405, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0045, + "step": 22760 + }, + { + "epoch": 1.3935981394210173, + "grad_norm": 0.20528365671634674, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0042, + "step": 22770 + }, + { + "epoch": 1.3942101719811495, + "grad_norm": 0.11913729459047318, + "learning_rate": 4.616077433849538e-06, + "loss": 0.003, + "step": 22780 + }, + { + "epoch": 1.3948222045412817, + "grad_norm": 0.21683627367019653, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0027, + "step": 22790 + }, + { + "epoch": 1.395434237101414, + "grad_norm": 0.12143554538488388, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0031, + "step": 22800 + }, + { + "epoch": 1.396046269661546, + "grad_norm": 0.14171159267425537, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0039, + "step": 22810 + }, + { + "epoch": 1.3966583022216783, + "grad_norm": 0.19254790246486664, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0043, + "step": 22820 + }, + { + "epoch": 1.3972703347818105, + "grad_norm": 0.12295825034379959, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0045, + "step": 22830 + }, + { + "epoch": 1.3978823673419427, + "grad_norm": 0.1274985820055008, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0037, + "step": 22840 + }, + { + "epoch": 1.398494399902075, + "grad_norm": 0.2940427362918854, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0059, + "step": 22850 + }, + { + "epoch": 1.3991064324622071, + "grad_norm": 0.15357589721679688, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0032, + "step": 22860 + }, + { + "epoch": 1.3997184650223393, + "grad_norm": 0.12781603634357452, + "learning_rate": 4.554529907376127e-06, + "loss": 0.003, + "step": 22870 + }, + { + "epoch": 1.4003304975824715, + "grad_norm": 0.34976109862327576, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0047, + "step": 22880 + }, + { + "epoch": 1.4009425301426035, + "grad_norm": 0.1797824203968048, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0034, + "step": 22890 + }, + { + "epoch": 1.4015545627027357, + "grad_norm": 0.13750647008419037, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0046, + "step": 22900 + }, + { + "epoch": 1.402166595262868, + "grad_norm": 0.22893266379833221, + "learning_rate": 4.527371771040039e-06, + "loss": 0.005, + "step": 22910 + }, + { + "epoch": 1.4027786278230001, + "grad_norm": 0.1595923751592636, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0045, + "step": 22920 + }, + { + "epoch": 1.4033906603831323, + "grad_norm": 0.11474192142486572, + "learning_rate": 4.513838246961138e-06, + "loss": 0.003, + "step": 22930 + }, + { + "epoch": 1.4040026929432645, + "grad_norm": 0.12208060175180435, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.4046147255033967, + "grad_norm": 0.2919016480445862, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0036, + "step": 22950 + }, + { + "epoch": 1.405226758063529, + "grad_norm": 0.19161155819892883, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0049, + "step": 22960 + }, + { + "epoch": 1.4058387906236611, + "grad_norm": 0.1454700380563736, + "learning_rate": 4.486862604628113e-06, + "loss": 0.004, + "step": 22970 + }, + { + "epoch": 1.4064508231837933, + "grad_norm": 0.227305606007576, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0043, + "step": 22980 + }, + { + "epoch": 1.4070628557439255, + "grad_norm": 0.09430288523435593, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0051, + "step": 22990 + }, + { + "epoch": 1.4076748883040577, + "grad_norm": 0.09664178639650345, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.40828692086419, + "grad_norm": 0.21268269419670105, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0031, + "step": 23010 + }, + { + "epoch": 1.4088989534243221, + "grad_norm": 0.09796992689371109, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0041, + "step": 23020 + }, + { + "epoch": 1.4095109859844543, + "grad_norm": 0.18376071751117706, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0039, + "step": 23030 + }, + { + "epoch": 1.4101230185445865, + "grad_norm": 0.10276145488023758, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0035, + "step": 23040 + }, + { + "epoch": 1.4107350511047188, + "grad_norm": 0.16089564561843872, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0051, + "step": 23050 + }, + { + "epoch": 1.411347083664851, + "grad_norm": 0.1825491487979889, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0036, + "step": 23060 + }, + { + "epoch": 1.4119591162249832, + "grad_norm": 0.24405492842197418, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0028, + "step": 23070 + }, + { + "epoch": 1.4125711487851154, + "grad_norm": 0.14085668325424194, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0039, + "step": 23080 + }, + { + "epoch": 1.4131831813452476, + "grad_norm": 0.11708472669124603, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0035, + "step": 23090 + }, + { + "epoch": 1.4137952139053798, + "grad_norm": 0.12108796834945679, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0036, + "step": 23100 + }, + { + "epoch": 1.414407246465512, + "grad_norm": 0.14601854979991913, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0036, + "step": 23110 + }, + { + "epoch": 1.4150192790256442, + "grad_norm": 0.10614772886037827, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0031, + "step": 23120 + }, + { + "epoch": 1.4156313115857764, + "grad_norm": 0.09014416486024857, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0027, + "step": 23130 + }, + { + "epoch": 1.4162433441459086, + "grad_norm": 0.15246634185314178, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0031, + "step": 23140 + }, + { + "epoch": 1.4168553767060408, + "grad_norm": 0.20104879140853882, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0075, + "step": 23150 + }, + { + "epoch": 1.417467409266173, + "grad_norm": 0.1359969973564148, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0036, + "step": 23160 + }, + { + "epoch": 1.4180794418263052, + "grad_norm": 0.19849587976932526, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0054, + "step": 23170 + }, + { + "epoch": 1.4186914743864374, + "grad_norm": 0.12617377936840057, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0024, + "step": 23180 + }, + { + "epoch": 1.4193035069465696, + "grad_norm": 0.15024134516716003, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0034, + "step": 23190 + }, + { + "epoch": 1.4199155395067018, + "grad_norm": 0.2345605194568634, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0036, + "step": 23200 + }, + { + "epoch": 1.420527572066834, + "grad_norm": 0.13125917315483093, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0035, + "step": 23210 + }, + { + "epoch": 1.4211396046269662, + "grad_norm": 0.20977836847305298, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0038, + "step": 23220 + }, + { + "epoch": 1.4217516371870984, + "grad_norm": 0.3925677537918091, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0043, + "step": 23230 + }, + { + "epoch": 1.4223636697472306, + "grad_norm": 0.17691555619239807, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0048, + "step": 23240 + }, + { + "epoch": 1.4229757023073628, + "grad_norm": 0.18366187810897827, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0033, + "step": 23250 + }, + { + "epoch": 1.423587734867495, + "grad_norm": 0.15539205074310303, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0039, + "step": 23260 + }, + { + "epoch": 1.4241997674276272, + "grad_norm": 0.15048520267009735, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0032, + "step": 23270 + }, + { + "epoch": 1.4248117999877594, + "grad_norm": 0.2631739675998688, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0061, + "step": 23280 + }, + { + "epoch": 1.4254238325478916, + "grad_norm": 0.18545641005039215, + "learning_rate": 4.275502195405868e-06, + "loss": 0.005, + "step": 23290 + }, + { + "epoch": 1.4260358651080238, + "grad_norm": 0.25486356019973755, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.426647897668156, + "grad_norm": 0.2514204978942871, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.427259930228288, + "grad_norm": 0.12997376918792725, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0034, + "step": 23320 + }, + { + "epoch": 1.4278719627884202, + "grad_norm": 0.26096200942993164, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0047, + "step": 23330 + }, + { + "epoch": 1.4284839953485524, + "grad_norm": 0.2292930781841278, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0038, + "step": 23340 + }, + { + "epoch": 1.4290960279086846, + "grad_norm": 0.20056717097759247, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0037, + "step": 23350 + }, + { + "epoch": 1.4297080604688168, + "grad_norm": 0.1608581393957138, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0032, + "step": 23360 + }, + { + "epoch": 1.430320093028949, + "grad_norm": 0.235102578997612, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0034, + "step": 23370 + }, + { + "epoch": 1.4309321255890812, + "grad_norm": 0.11869259178638458, + "learning_rate": 4.217502203129258e-06, + "loss": 0.005, + "step": 23380 + }, + { + "epoch": 1.4315441581492134, + "grad_norm": 0.167036771774292, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0045, + "step": 23390 + }, + { + "epoch": 1.4321561907093456, + "grad_norm": 0.13766071200370789, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.4327682232694778, + "grad_norm": 0.15444986522197723, + "learning_rate": 4.198311874248223e-06, + "loss": 0.004, + "step": 23410 + }, + { + "epoch": 1.43338025582961, + "grad_norm": 0.11997724324464798, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0034, + "step": 23420 + }, + { + "epoch": 1.4339922883897422, + "grad_norm": 0.1533307433128357, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0038, + "step": 23430 + }, + { + "epoch": 1.4346043209498744, + "grad_norm": 0.10954161733388901, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0045, + "step": 23440 + }, + { + "epoch": 1.4352163535100066, + "grad_norm": 0.16601058840751648, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4358283860701389, + "grad_norm": 0.1756889373064041, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0035, + "step": 23460 + }, + { + "epoch": 1.436440418630271, + "grad_norm": 0.12633845210075378, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0046, + "step": 23470 + }, + { + "epoch": 1.4370524511904033, + "grad_norm": 0.15678541362285614, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0025, + "step": 23480 + }, + { + "epoch": 1.4376644837505355, + "grad_norm": 0.13923659920692444, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0052, + "step": 23490 + }, + { + "epoch": 1.4382765163106677, + "grad_norm": 0.28792211413383484, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4388885488707999, + "grad_norm": 0.16125047206878662, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0037, + "step": 23510 + }, + { + "epoch": 1.439500581430932, + "grad_norm": 0.2653597593307495, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0067, + "step": 23520 + }, + { + "epoch": 1.4401126139910643, + "grad_norm": 0.2692917585372925, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0031, + "step": 23530 + }, + { + "epoch": 1.4407246465511965, + "grad_norm": 0.2234862893819809, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0044, + "step": 23540 + }, + { + "epoch": 1.4413366791113287, + "grad_norm": 0.17526887357234955, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0025, + "step": 23550 + }, + { + "epoch": 1.4419487116714609, + "grad_norm": 0.10404029488563538, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0041, + "step": 23560 + }, + { + "epoch": 1.442560744231593, + "grad_norm": 0.1385052353143692, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0027, + "step": 23570 + }, + { + "epoch": 1.4431727767917253, + "grad_norm": 0.30865412950515747, + "learning_rate": 4.090929556079854e-06, + "loss": 0.004, + "step": 23580 + }, + { + "epoch": 1.4437848093518575, + "grad_norm": 0.10908320546150208, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0041, + "step": 23590 + }, + { + "epoch": 1.4443968419119897, + "grad_norm": 0.09885916113853455, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0045, + "step": 23600 + }, + { + "epoch": 1.445008874472122, + "grad_norm": 0.1685211956501007, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0031, + "step": 23610 + }, + { + "epoch": 1.445620907032254, + "grad_norm": 0.0967954769730568, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0031, + "step": 23620 + }, + { + "epoch": 1.4462329395923863, + "grad_norm": 0.07489120960235596, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0038, + "step": 23630 + }, + { + "epoch": 1.4468449721525185, + "grad_norm": 0.20616063475608826, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0043, + "step": 23640 + }, + { + "epoch": 1.4474570047126507, + "grad_norm": 0.15788249671459198, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0031, + "step": 23650 + }, + { + "epoch": 1.448069037272783, + "grad_norm": 0.10360633581876755, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0027, + "step": 23660 + }, + { + "epoch": 1.4486810698329151, + "grad_norm": 0.2871163785457611, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0028, + "step": 23670 + }, + { + "epoch": 1.4492931023930473, + "grad_norm": 0.15280364453792572, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0042, + "step": 23680 + }, + { + "epoch": 1.4499051349531795, + "grad_norm": 0.17502477765083313, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0031, + "step": 23690 + }, + { + "epoch": 1.4505171675133117, + "grad_norm": 0.2154005616903305, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0036, + "step": 23700 + }, + { + "epoch": 1.451129200073444, + "grad_norm": 0.15002919733524323, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0033, + "step": 23710 + }, + { + "epoch": 1.4517412326335761, + "grad_norm": 0.10422170162200928, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0032, + "step": 23720 + }, + { + "epoch": 1.4523532651937083, + "grad_norm": 0.15197636187076569, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0043, + "step": 23730 + }, + { + "epoch": 1.4529652977538405, + "grad_norm": 0.2571481466293335, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.4535773303139727, + "grad_norm": 0.12697578966617584, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0034, + "step": 23750 + }, + { + "epoch": 1.454189362874105, + "grad_norm": 0.14347535371780396, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0051, + "step": 23760 + }, + { + "epoch": 1.4548013954342371, + "grad_norm": 0.1494351178407669, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0037, + "step": 23770 + }, + { + "epoch": 1.4554134279943693, + "grad_norm": 0.23901797831058502, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0032, + "step": 23780 + }, + { + "epoch": 1.4560254605545015, + "grad_norm": 0.1434790939092636, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0036, + "step": 23790 + }, + { + "epoch": 1.4566374931146338, + "grad_norm": 0.1456829458475113, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0034, + "step": 23800 + }, + { + "epoch": 1.457249525674766, + "grad_norm": 0.33969590067863464, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0038, + "step": 23810 + }, + { + "epoch": 1.4578615582348982, + "grad_norm": 0.1768753081560135, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0027, + "step": 23820 + }, + { + "epoch": 1.4584735907950304, + "grad_norm": 0.15212708711624146, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0032, + "step": 23830 + }, + { + "epoch": 1.4590856233551626, + "grad_norm": 0.10870973765850067, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0033, + "step": 23840 + }, + { + "epoch": 1.4596976559152948, + "grad_norm": 0.17898528277873993, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0022, + "step": 23850 + }, + { + "epoch": 1.460309688475427, + "grad_norm": 0.15515227615833282, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0034, + "step": 23860 + }, + { + "epoch": 1.4609217210355592, + "grad_norm": 0.11047070473432541, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0032, + "step": 23870 + }, + { + "epoch": 1.4615337535956914, + "grad_norm": 0.08628113567829132, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0032, + "step": 23880 + }, + { + "epoch": 1.4621457861558236, + "grad_norm": 0.358903706073761, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0043, + "step": 23890 + }, + { + "epoch": 1.4627578187159558, + "grad_norm": 0.13986052572727203, + "learning_rate": 3.895183209452123e-06, + "loss": 0.003, + "step": 23900 + }, + { + "epoch": 1.463369851276088, + "grad_norm": 0.09236793220043182, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0029, + "step": 23910 + }, + { + "epoch": 1.4639818838362202, + "grad_norm": 0.14616963267326355, + "learning_rate": 3.883230136754435e-06, + "loss": 0.005, + "step": 23920 + }, + { + "epoch": 1.4645939163963524, + "grad_norm": 0.0754290223121643, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0031, + "step": 23930 + }, + { + "epoch": 1.4652059489564846, + "grad_norm": 0.16520163416862488, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0037, + "step": 23940 + }, + { + "epoch": 1.4658179815166168, + "grad_norm": 0.06801608204841614, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0023, + "step": 23950 + }, + { + "epoch": 1.466430014076749, + "grad_norm": 0.3087909519672394, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0042, + "step": 23960 + }, + { + "epoch": 1.4670420466368812, + "grad_norm": 0.23470532894134521, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0046, + "step": 23970 + }, + { + "epoch": 1.4676540791970134, + "grad_norm": 0.10248749703168869, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0025, + "step": 23980 + }, + { + "epoch": 1.4682661117571456, + "grad_norm": 0.12478570640087128, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0035, + "step": 23990 + }, + { + "epoch": 1.4688781443172778, + "grad_norm": 0.16669252514839172, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0034, + "step": 24000 + }, + { + "epoch": 1.46949017687741, + "grad_norm": 0.12477939575910568, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.470102209437542, + "grad_norm": 0.1738445907831192, + "learning_rate": 3.823967005382315e-06, + "loss": 0.003, + "step": 24020 + }, + { + "epoch": 1.4707142419976742, + "grad_norm": 0.11228524148464203, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0029, + "step": 24030 + }, + { + "epoch": 1.4713262745578064, + "grad_norm": 0.28472721576690674, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0035, + "step": 24040 + }, + { + "epoch": 1.4719383071179386, + "grad_norm": 0.18087328970432281, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0037, + "step": 24050 + }, + { + "epoch": 1.4725503396780708, + "grad_norm": 0.39030423760414124, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0043, + "step": 24060 + }, + { + "epoch": 1.473162372238203, + "grad_norm": 0.164345845580101, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0029, + "step": 24070 + }, + { + "epoch": 1.4737744047983352, + "grad_norm": 0.14081600308418274, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0038, + "step": 24080 + }, + { + "epoch": 1.4743864373584674, + "grad_norm": 0.27649205923080444, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0037, + "step": 24090 + }, + { + "epoch": 1.4749984699185996, + "grad_norm": 0.08673480153083801, + "learning_rate": 3.777162510056721e-06, + "loss": 0.004, + "step": 24100 + }, + { + "epoch": 1.4756105024787318, + "grad_norm": 0.11770286411046982, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0033, + "step": 24110 + }, + { + "epoch": 1.476222535038864, + "grad_norm": 0.11967290937900543, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0035, + "step": 24120 + }, + { + "epoch": 1.4768345675989962, + "grad_norm": 0.12635833024978638, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0043, + "step": 24130 + }, + { + "epoch": 1.4774466001591284, + "grad_norm": 0.13505803048610687, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0034, + "step": 24140 + }, + { + "epoch": 1.4780586327192606, + "grad_norm": 0.17781652510166168, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0031, + "step": 24150 + }, + { + "epoch": 1.4786706652793928, + "grad_norm": 0.18974725902080536, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0051, + "step": 24160 + }, + { + "epoch": 1.479282697839525, + "grad_norm": 0.12072815746068954, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0027, + "step": 24170 + }, + { + "epoch": 1.4798947303996572, + "grad_norm": 0.10813914984464645, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0023, + "step": 24180 + }, + { + "epoch": 1.4805067629597894, + "grad_norm": 0.07975378632545471, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0035, + "step": 24190 + }, + { + "epoch": 1.4811187955199216, + "grad_norm": 0.0948014184832573, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0033, + "step": 24200 + }, + { + "epoch": 1.4817308280800539, + "grad_norm": 0.11943913251161575, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0038, + "step": 24210 + }, + { + "epoch": 1.482342860640186, + "grad_norm": 0.34374934434890747, + "learning_rate": 3.707974016467e-06, + "loss": 0.0043, + "step": 24220 + }, + { + "epoch": 1.4829548932003183, + "grad_norm": 0.264528751373291, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0037, + "step": 24230 + }, + { + "epoch": 1.4835669257604505, + "grad_norm": 0.08419078588485718, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0031, + "step": 24240 + }, + { + "epoch": 1.4841789583205827, + "grad_norm": 0.3805602192878723, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0039, + "step": 24250 + }, + { + "epoch": 1.4847909908807149, + "grad_norm": 0.09091196954250336, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0034, + "step": 24260 + }, + { + "epoch": 1.485403023440847, + "grad_norm": 0.1352047175168991, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0033, + "step": 24270 + }, + { + "epoch": 1.4860150560009793, + "grad_norm": 0.14287787675857544, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0033, + "step": 24280 + }, + { + "epoch": 1.4866270885611115, + "grad_norm": 0.15490861237049103, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0037, + "step": 24290 + }, + { + "epoch": 1.4872391211212437, + "grad_norm": 0.08607941120862961, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0042, + "step": 24300 + }, + { + "epoch": 1.4878511536813759, + "grad_norm": 0.2872561514377594, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0039, + "step": 24310 + }, + { + "epoch": 1.488463186241508, + "grad_norm": 0.09383561462163925, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4890752188016403, + "grad_norm": 0.13576671481132507, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0039, + "step": 24330 + }, + { + "epoch": 1.4896872513617725, + "grad_norm": 0.21924526989459991, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0038, + "step": 24340 + }, + { + "epoch": 1.4902992839219047, + "grad_norm": 0.24333837628364563, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.490911316482037, + "grad_norm": 0.08171682059764862, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0034, + "step": 24360 + }, + { + "epoch": 1.491523349042169, + "grad_norm": 0.11815544962882996, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0027, + "step": 24370 + }, + { + "epoch": 1.4921353816023013, + "grad_norm": 0.15248773992061615, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0035, + "step": 24380 + }, + { + "epoch": 1.4927474141624335, + "grad_norm": 0.13664020597934723, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0032, + "step": 24390 + }, + { + "epoch": 1.4933594467225657, + "grad_norm": 0.2877022624015808, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0034, + "step": 24400 + }, + { + "epoch": 1.493971479282698, + "grad_norm": 0.1447642594575882, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0033, + "step": 24410 + }, + { + "epoch": 1.4945835118428301, + "grad_norm": 0.18032193183898926, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0036, + "step": 24420 + }, + { + "epoch": 1.4951955444029623, + "grad_norm": 0.1249038353562355, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0023, + "step": 24430 + }, + { + "epoch": 1.4958075769630943, + "grad_norm": 0.21674089133739471, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0036, + "step": 24440 + }, + { + "epoch": 1.4964196095232265, + "grad_norm": 0.2503979504108429, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0039, + "step": 24450 + }, + { + "epoch": 1.4970316420833587, + "grad_norm": 0.15412171185016632, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0035, + "step": 24460 + }, + { + "epoch": 1.497643674643491, + "grad_norm": 0.17718803882598877, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0055, + "step": 24470 + }, + { + "epoch": 1.498255707203623, + "grad_norm": 0.24290283024311066, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0033, + "step": 24480 + }, + { + "epoch": 1.4988677397637553, + "grad_norm": 0.20131447911262512, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0035, + "step": 24490 + }, + { + "epoch": 1.4994797723238875, + "grad_norm": 0.18041104078292847, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0037, + "step": 24500 + }, + { + "epoch": 1.5000918048840197, + "grad_norm": 0.11311472952365875, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0027, + "step": 24510 + }, + { + "epoch": 1.500703837444152, + "grad_norm": 0.10401099175214767, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0035, + "step": 24520 + }, + { + "epoch": 1.5013158700042841, + "grad_norm": 0.16640698909759521, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0029, + "step": 24530 + }, + { + "epoch": 1.5019279025644163, + "grad_norm": 0.1116192489862442, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0027, + "step": 24540 + }, + { + "epoch": 1.5025399351245485, + "grad_norm": 0.14617346227169037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0037, + "step": 24550 + }, + { + "epoch": 1.5031519676846807, + "grad_norm": 0.10546499490737915, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0025, + "step": 24560 + }, + { + "epoch": 1.503764000244813, + "grad_norm": 0.11696954816579819, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0039, + "step": 24570 + }, + { + "epoch": 1.5043760328049451, + "grad_norm": 0.1503429412841797, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0038, + "step": 24580 + }, + { + "epoch": 1.5049880653650773, + "grad_norm": 0.13094773888587952, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0037, + "step": 24590 + }, + { + "epoch": 1.5056000979252095, + "grad_norm": 0.1519947648048401, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0027, + "step": 24600 + }, + { + "epoch": 1.5062121304853417, + "grad_norm": 0.3586391806602478, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0041, + "step": 24610 + }, + { + "epoch": 1.506824163045474, + "grad_norm": 0.14964115619659424, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0029, + "step": 24620 + }, + { + "epoch": 1.5074361956056062, + "grad_norm": 0.2676304578781128, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0033, + "step": 24630 + }, + { + "epoch": 1.5080482281657384, + "grad_norm": 0.117411769926548, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0041, + "step": 24640 + }, + { + "epoch": 1.5086602607258706, + "grad_norm": 0.11224953830242157, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0035, + "step": 24650 + }, + { + "epoch": 1.5092722932860028, + "grad_norm": 0.14367471635341644, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0033, + "step": 24660 + }, + { + "epoch": 1.509884325846135, + "grad_norm": 0.27663105726242065, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.004, + "step": 24670 + }, + { + "epoch": 1.5104963584062672, + "grad_norm": 0.08599471300840378, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0031, + "step": 24680 + }, + { + "epoch": 1.5111083909663994, + "grad_norm": 0.11320041120052338, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0033, + "step": 24690 + }, + { + "epoch": 1.5117204235265316, + "grad_norm": 0.0896427258849144, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0032, + "step": 24700 + }, + { + "epoch": 1.5123324560866638, + "grad_norm": 0.1055784597992897, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0028, + "step": 24710 + }, + { + "epoch": 1.512944488646796, + "grad_norm": 0.0936208963394165, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0036, + "step": 24720 + }, + { + "epoch": 1.5135565212069282, + "grad_norm": 0.13069137930870056, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0033, + "step": 24730 + }, + { + "epoch": 1.5141685537670604, + "grad_norm": 0.17260710895061493, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0034, + "step": 24740 + }, + { + "epoch": 1.5147805863271926, + "grad_norm": 0.26109611988067627, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0044, + "step": 24750 + }, + { + "epoch": 1.5153926188873248, + "grad_norm": 0.22439827024936676, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0042, + "step": 24760 + }, + { + "epoch": 1.516004651447457, + "grad_norm": 0.2269357591867447, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0039, + "step": 24770 + }, + { + "epoch": 1.5166166840075892, + "grad_norm": 0.20416954159736633, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0039, + "step": 24780 + }, + { + "epoch": 1.5172287165677214, + "grad_norm": 0.1766926646232605, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0031, + "step": 24790 + }, + { + "epoch": 1.5178407491278536, + "grad_norm": 0.05759773403406143, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0029, + "step": 24800 + }, + { + "epoch": 1.5184527816879858, + "grad_norm": 0.19152496755123138, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0034, + "step": 24810 + }, + { + "epoch": 1.519064814248118, + "grad_norm": 0.09876703470945358, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.5196768468082502, + "grad_norm": 0.11626110225915909, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0037, + "step": 24830 + }, + { + "epoch": 1.5202888793683824, + "grad_norm": 0.13713783025741577, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0027, + "step": 24840 + }, + { + "epoch": 1.5209009119285146, + "grad_norm": 0.19144660234451294, + "learning_rate": 3.36521439484193e-06, + "loss": 0.004, + "step": 24850 + }, + { + "epoch": 1.5215129444886468, + "grad_norm": 0.1376778483390808, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0037, + "step": 24860 + }, + { + "epoch": 1.522124977048779, + "grad_norm": 0.4120432436466217, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0042, + "step": 24870 + }, + { + "epoch": 1.5227370096089112, + "grad_norm": 0.14245551824569702, + "learning_rate": 3.349767211300933e-06, + "loss": 0.003, + "step": 24880 + }, + { + "epoch": 1.5233490421690434, + "grad_norm": 0.19136923551559448, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0053, + "step": 24890 + }, + { + "epoch": 1.5239610747291756, + "grad_norm": 0.28412777185440063, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0031, + "step": 24900 + }, + { + "epoch": 1.5245731072893078, + "grad_norm": 0.18925072252750397, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.003, + "step": 24910 + }, + { + "epoch": 1.52518513984944, + "grad_norm": 0.21378494799137115, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0034, + "step": 24920 + }, + { + "epoch": 1.5257971724095722, + "grad_norm": 0.19160443544387817, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0037, + "step": 24930 + }, + { + "epoch": 1.5264092049697044, + "grad_norm": 0.19070027768611908, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.5270212375298367, + "grad_norm": 0.20489074289798737, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.003, + "step": 24950 + }, + { + "epoch": 1.5276332700899689, + "grad_norm": 0.15747228264808655, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0037, + "step": 24960 + }, + { + "epoch": 1.528245302650101, + "grad_norm": 0.21312901377677917, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0035, + "step": 24970 + }, + { + "epoch": 1.5288573352102333, + "grad_norm": 0.10329846292734146, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0033, + "step": 24980 + }, + { + "epoch": 1.5294693677703655, + "grad_norm": 0.13872355222702026, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0056, + "step": 24990 + }, + { + "epoch": 1.5300814003304977, + "grad_norm": 0.08532251417636871, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0026, + "step": 25000 + }, + { + "epoch": 1.5306934328906299, + "grad_norm": 0.1309783011674881, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0038, + "step": 25010 + }, + { + "epoch": 1.531305465450762, + "grad_norm": 0.16484731435775757, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0034, + "step": 25020 + }, + { + "epoch": 1.5319174980108943, + "grad_norm": 0.1756003201007843, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0044, + "step": 25030 + }, + { + "epoch": 1.5325295305710265, + "grad_norm": 0.13745243847370148, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0063, + "step": 25040 + }, + { + "epoch": 1.5331415631311587, + "grad_norm": 0.1077183336019516, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0032, + "step": 25050 + }, + { + "epoch": 1.5337535956912909, + "grad_norm": 0.3091605007648468, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0036, + "step": 25060 + }, + { + "epoch": 1.534365628251423, + "grad_norm": 0.13469856977462769, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0031, + "step": 25070 + }, + { + "epoch": 1.5349776608115553, + "grad_norm": 0.2445354014635086, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0037, + "step": 25080 + }, + { + "epoch": 1.5355896933716875, + "grad_norm": 0.1065889522433281, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0038, + "step": 25090 + }, + { + "epoch": 1.5362017259318197, + "grad_norm": 0.1539459079504013, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0036, + "step": 25100 + }, + { + "epoch": 1.536813758491952, + "grad_norm": 0.23242861032485962, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0037, + "step": 25110 + }, + { + "epoch": 1.537425791052084, + "grad_norm": 0.18660615384578705, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0034, + "step": 25120 + }, + { + "epoch": 1.5380378236122163, + "grad_norm": 0.14089861512184143, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0057, + "step": 25130 + }, + { + "epoch": 1.5386498561723485, + "grad_norm": 0.30568358302116394, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0036, + "step": 25140 + }, + { + "epoch": 1.5392618887324807, + "grad_norm": 0.0965384691953659, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0029, + "step": 25150 + }, + { + "epoch": 1.539873921292613, + "grad_norm": 0.12925416231155396, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0028, + "step": 25160 + }, + { + "epoch": 1.5404859538527451, + "grad_norm": 0.10820749402046204, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0025, + "step": 25170 + }, + { + "epoch": 1.5410979864128773, + "grad_norm": 0.200232595205307, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0038, + "step": 25180 + }, + { + "epoch": 1.5417100189730095, + "grad_norm": 0.13515910506248474, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0037, + "step": 25190 + }, + { + "epoch": 1.5423220515331417, + "grad_norm": 0.08493158221244812, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0026, + "step": 25200 + }, + { + "epoch": 1.542934084093274, + "grad_norm": 0.21674226224422455, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0047, + "step": 25210 + }, + { + "epoch": 1.543546116653406, + "grad_norm": 0.18259066343307495, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0029, + "step": 25220 + }, + { + "epoch": 1.5441581492135381, + "grad_norm": 0.14857260882854462, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0029, + "step": 25230 + }, + { + "epoch": 1.5447701817736703, + "grad_norm": 0.1540914922952652, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0026, + "step": 25240 + }, + { + "epoch": 1.5453822143338025, + "grad_norm": 0.08827090263366699, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0029, + "step": 25250 + }, + { + "epoch": 1.5459942468939347, + "grad_norm": 0.07511961460113525, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0035, + "step": 25260 + }, + { + "epoch": 1.546606279454067, + "grad_norm": 0.26209381222724915, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0033, + "step": 25270 + }, + { + "epoch": 1.5472183120141991, + "grad_norm": 0.08861620724201202, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0033, + "step": 25280 + }, + { + "epoch": 1.5478303445743313, + "grad_norm": 0.1642802655696869, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0032, + "step": 25290 + }, + { + "epoch": 1.5484423771344635, + "grad_norm": 0.24771225452423096, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0034, + "step": 25300 + }, + { + "epoch": 1.5490544096945957, + "grad_norm": 0.2717854976654053, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.549666442254728, + "grad_norm": 0.12177802622318268, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0029, + "step": 25320 + }, + { + "epoch": 1.5502784748148601, + "grad_norm": 0.09988416731357574, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0052, + "step": 25330 + }, + { + "epoch": 1.5508905073749923, + "grad_norm": 0.08877446502447128, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0024, + "step": 25340 + }, + { + "epoch": 1.5515025399351245, + "grad_norm": 0.16233091056346893, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.003, + "step": 25350 + }, + { + "epoch": 1.5521145724952568, + "grad_norm": 0.10167178511619568, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0034, + "step": 25360 + }, + { + "epoch": 1.552726605055389, + "grad_norm": 0.14738866686820984, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0036, + "step": 25370 + }, + { + "epoch": 1.5533386376155212, + "grad_norm": 0.07526370882987976, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0036, + "step": 25380 + }, + { + "epoch": 1.5539506701756534, + "grad_norm": 0.1659732311964035, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0032, + "step": 25390 + }, + { + "epoch": 1.5545627027357856, + "grad_norm": 0.18707287311553955, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0023, + "step": 25400 + }, + { + "epoch": 1.5551747352959178, + "grad_norm": 0.21416662633419037, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0029, + "step": 25410 + }, + { + "epoch": 1.55578676785605, + "grad_norm": 0.3034561574459076, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5563988004161822, + "grad_norm": 0.18879717588424683, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5570108329763144, + "grad_norm": 0.12917254865169525, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0038, + "step": 25440 + }, + { + "epoch": 1.5576228655364466, + "grad_norm": 0.0970548763871193, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0046, + "step": 25450 + }, + { + "epoch": 1.5582348980965788, + "grad_norm": 0.17424598336219788, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.558846930656711, + "grad_norm": 0.11429346352815628, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0039, + "step": 25470 + }, + { + "epoch": 1.5594589632168432, + "grad_norm": 0.19154596328735352, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0028, + "step": 25480 + }, + { + "epoch": 1.5600709957769754, + "grad_norm": 0.1475156843662262, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0028, + "step": 25490 + }, + { + "epoch": 1.5606830283371074, + "grad_norm": 0.29066604375839233, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0037, + "step": 25500 + }, + { + "epoch": 1.5612950608972396, + "grad_norm": 0.21379634737968445, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.004, + "step": 25510 + }, + { + "epoch": 1.5619070934573718, + "grad_norm": 0.1648091822862625, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.003, + "step": 25520 + }, + { + "epoch": 1.562519126017504, + "grad_norm": 0.2791198790073395, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0038, + "step": 25530 + }, + { + "epoch": 1.5631311585776362, + "grad_norm": 0.13038018345832825, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0028, + "step": 25540 + }, + { + "epoch": 1.5637431911377684, + "grad_norm": 0.07513634115457535, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0032, + "step": 25550 + }, + { + "epoch": 1.5643552236979006, + "grad_norm": 0.34259703755378723, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5649672562580328, + "grad_norm": 0.1602829545736313, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0036, + "step": 25570 + }, + { + "epoch": 1.565579288818165, + "grad_norm": 0.11303776502609253, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.003, + "step": 25580 + }, + { + "epoch": 1.5661913213782972, + "grad_norm": 0.06348636001348495, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0038, + "step": 25590 + }, + { + "epoch": 1.5668033539384294, + "grad_norm": 0.2563594579696655, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0027, + "step": 25600 + }, + { + "epoch": 1.5674153864985616, + "grad_norm": 0.08260748535394669, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0029, + "step": 25610 + }, + { + "epoch": 1.5680274190586938, + "grad_norm": 0.15986980497837067, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0044, + "step": 25620 + }, + { + "epoch": 1.568639451618826, + "grad_norm": 0.19412761926651, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.004, + "step": 25630 + }, + { + "epoch": 1.5692514841789582, + "grad_norm": 0.16794568300247192, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0042, + "step": 25640 + }, + { + "epoch": 1.5698635167390904, + "grad_norm": 0.34898805618286133, + "learning_rate": 2.981383959667165e-06, + "loss": 0.003, + "step": 25650 + }, + { + "epoch": 1.5704755492992226, + "grad_norm": 0.11825685203075409, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0037, + "step": 25660 + }, + { + "epoch": 1.5710875818593548, + "grad_norm": 0.1430155634880066, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0049, + "step": 25670 + }, + { + "epoch": 1.571699614419487, + "grad_norm": 0.13148540258407593, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0031, + "step": 25680 + }, + { + "epoch": 1.5723116469796192, + "grad_norm": 0.14384756982326508, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0036, + "step": 25690 + }, + { + "epoch": 1.5729236795397514, + "grad_norm": 0.11322541534900665, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0028, + "step": 25700 + }, + { + "epoch": 1.5735357120998836, + "grad_norm": 0.1428067833185196, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0033, + "step": 25710 + }, + { + "epoch": 1.5741477446600158, + "grad_norm": 0.1169947013258934, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0024, + "step": 25720 + }, + { + "epoch": 1.574759777220148, + "grad_norm": 0.33150142431259155, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0038, + "step": 25730 + }, + { + "epoch": 1.5753718097802802, + "grad_norm": 0.12486983090639114, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.003, + "step": 25740 + }, + { + "epoch": 1.5759838423404124, + "grad_norm": 0.12485318630933762, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0025, + "step": 25750 + }, + { + "epoch": 1.5765958749005446, + "grad_norm": 0.10158280283212662, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0032, + "step": 25760 + }, + { + "epoch": 1.5772079074606769, + "grad_norm": 0.13820113241672516, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0041, + "step": 25770 + }, + { + "epoch": 1.577819940020809, + "grad_norm": 0.18718287348747253, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0027, + "step": 25780 + }, + { + "epoch": 1.5784319725809413, + "grad_norm": 0.154324010014534, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.004, + "step": 25790 + }, + { + "epoch": 1.5790440051410735, + "grad_norm": 0.10862802714109421, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0039, + "step": 25800 + }, + { + "epoch": 1.5796560377012057, + "grad_norm": 0.11738114804029465, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0032, + "step": 25810 + }, + { + "epoch": 1.5802680702613379, + "grad_norm": 0.08674368262290955, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0035, + "step": 25820 + }, + { + "epoch": 1.58088010282147, + "grad_norm": 0.16917847096920013, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0032, + "step": 25830 + }, + { + "epoch": 1.5814921353816023, + "grad_norm": 0.10122957825660706, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0044, + "step": 25840 + }, + { + "epoch": 1.5821041679417345, + "grad_norm": 0.14450572431087494, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0031, + "step": 25850 + }, + { + "epoch": 1.5827162005018667, + "grad_norm": 0.11220426112413406, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0035, + "step": 25860 + }, + { + "epoch": 1.5833282330619989, + "grad_norm": 0.15793107450008392, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0034, + "step": 25870 + }, + { + "epoch": 1.583940265622131, + "grad_norm": 0.11485118418931961, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0049, + "step": 25880 + }, + { + "epoch": 1.5845522981822633, + "grad_norm": 0.11588255316019058, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0032, + "step": 25890 + }, + { + "epoch": 1.5851643307423955, + "grad_norm": 0.09770877659320831, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0033, + "step": 25900 + }, + { + "epoch": 1.5857763633025277, + "grad_norm": 0.4078996479511261, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0041, + "step": 25910 + }, + { + "epoch": 1.58638839586266, + "grad_norm": 0.16744333505630493, + "learning_rate": 2.865295218604555e-06, + "loss": 0.003, + "step": 25920 + }, + { + "epoch": 1.587000428422792, + "grad_norm": 0.10358662158250809, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0034, + "step": 25930 + }, + { + "epoch": 1.5876124609829243, + "grad_norm": 0.1420212686061859, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0024, + "step": 25940 + }, + { + "epoch": 1.5882244935430565, + "grad_norm": 0.1387208104133606, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0035, + "step": 25950 + }, + { + "epoch": 1.5888365261031887, + "grad_norm": 0.2383398711681366, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0044, + "step": 25960 + }, + { + "epoch": 1.589448558663321, + "grad_norm": 0.1263049691915512, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0029, + "step": 25970 + }, + { + "epoch": 1.5900605912234531, + "grad_norm": 0.10938797891139984, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0029, + "step": 25980 + }, + { + "epoch": 1.5906726237835853, + "grad_norm": 0.18173988163471222, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0033, + "step": 25990 + }, + { + "epoch": 1.5912846563437175, + "grad_norm": 0.20956522226333618, + "learning_rate": 2.832230653119002e-06, + "loss": 0.003, + "step": 26000 + }, + { + "epoch": 1.5918966889038497, + "grad_norm": 0.5168828368186951, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0038, + "step": 26010 + }, + { + "epoch": 1.592508721463982, + "grad_norm": 0.19130735099315643, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.003, + "step": 26020 + }, + { + "epoch": 1.5931207540241141, + "grad_norm": 0.2398800253868103, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0031, + "step": 26030 + }, + { + "epoch": 1.5937327865842463, + "grad_norm": 0.13288211822509766, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0039, + "step": 26040 + }, + { + "epoch": 1.5943448191443785, + "grad_norm": 0.12008156627416611, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.005, + "step": 26050 + }, + { + "epoch": 1.5949568517045107, + "grad_norm": 0.06939925253391266, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0026, + "step": 26060 + }, + { + "epoch": 1.595568884264643, + "grad_norm": 0.11179028451442719, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0032, + "step": 26070 + }, + { + "epoch": 1.5961809168247751, + "grad_norm": 0.07841819524765015, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0035, + "step": 26080 + }, + { + "epoch": 1.5967929493849073, + "grad_norm": 0.3470489978790283, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.0067, + "step": 26090 + }, + { + "epoch": 1.5974049819450395, + "grad_norm": 0.13002917170524597, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0035, + "step": 26100 + }, + { + "epoch": 1.5980170145051718, + "grad_norm": 0.10265816748142242, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0026, + "step": 26110 + }, + { + "epoch": 1.598629047065304, + "grad_norm": 0.0917414203286171, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.003, + "step": 26120 + }, + { + "epoch": 1.5992410796254362, + "grad_norm": 0.11112091690301895, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0039, + "step": 26130 + }, + { + "epoch": 1.5998531121855684, + "grad_norm": 0.08949574083089828, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0035, + "step": 26140 + }, + { + "epoch": 1.6004651447457006, + "grad_norm": 0.10606437176465988, + "learning_rate": 2.771889969647e-06, + "loss": 0.0046, + "step": 26150 + }, + { + "epoch": 1.6010771773058328, + "grad_norm": 0.1891089379787445, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0025, + "step": 26160 + }, + { + "epoch": 1.601689209865965, + "grad_norm": 0.11007837951183319, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0032, + "step": 26170 + }, + { + "epoch": 1.6023012424260972, + "grad_norm": 0.2129961997270584, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0032, + "step": 26180 + }, + { + "epoch": 1.6029132749862294, + "grad_norm": 0.2265758216381073, + "learning_rate": 2.756165401834933e-06, + "loss": 0.003, + "step": 26190 + }, + { + "epoch": 1.6035253075463616, + "grad_norm": 0.29450783133506775, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.0039, + "step": 26200 + }, + { + "epoch": 1.6041373401064938, + "grad_norm": 0.48828232288360596, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0054, + "step": 26210 + }, + { + "epoch": 1.604749372666626, + "grad_norm": 0.2561551630496979, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0038, + "step": 26220 + }, + { + "epoch": 1.6053614052267582, + "grad_norm": 0.1838567554950714, + "learning_rate": 2.740595627381096e-06, + "loss": 0.004, + "step": 26230 + }, + { + "epoch": 1.6059734377868904, + "grad_norm": 0.1419040560722351, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0028, + "step": 26240 + }, + { + "epoch": 1.6065854703470226, + "grad_norm": 0.11946547776460648, + "learning_rate": 2.732868879137055e-06, + "loss": 0.004, + "step": 26250 + }, + { + "epoch": 1.6071975029071548, + "grad_norm": 0.2451052963733673, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0041, + "step": 26260 + }, + { + "epoch": 1.607809535467287, + "grad_norm": 0.11013349890708923, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0031, + "step": 26270 + }, + { + "epoch": 1.6084215680274192, + "grad_norm": 0.13513876497745514, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0067, + "step": 26280 + }, + { + "epoch": 1.6090336005875514, + "grad_norm": 0.13167037069797516, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0054, + "step": 26290 + }, + { + "epoch": 1.6096456331476836, + "grad_norm": 0.17578460276126862, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0042, + "step": 26300 + }, + { + "epoch": 1.6102576657078158, + "grad_norm": 0.26278436183929443, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0052, + "step": 26310 + }, + { + "epoch": 1.610869698267948, + "grad_norm": 0.12841887772083282, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.0029, + "step": 26320 + }, + { + "epoch": 1.6114817308280802, + "grad_norm": 0.08532734215259552, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0037, + "step": 26330 + }, + { + "epoch": 1.6120937633882122, + "grad_norm": 0.23955127596855164, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.0026, + "step": 26340 + }, + { + "epoch": 1.6127057959483444, + "grad_norm": 0.11942708492279053, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0032, + "step": 26350 + }, + { + "epoch": 1.6133178285084766, + "grad_norm": 0.2980901002883911, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.0036, + "step": 26360 + }, + { + "epoch": 1.6139298610686088, + "grad_norm": 0.18042345345020294, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0023, + "step": 26370 + }, + { + "epoch": 1.614541893628741, + "grad_norm": 0.09250669926404953, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0028, + "step": 26380 + }, + { + "epoch": 1.6151539261888732, + "grad_norm": 0.11877484619617462, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.0042, + "step": 26390 + }, + { + "epoch": 1.6157659587490054, + "grad_norm": 0.20574252307415009, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0028, + "step": 26400 + }, + { + "epoch": 1.6163779913091376, + "grad_norm": 0.18342842161655426, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0028, + "step": 26410 + }, + { + "epoch": 1.6169900238692698, + "grad_norm": 0.18038654327392578, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0031, + "step": 26420 + }, + { + "epoch": 1.617602056429402, + "grad_norm": 0.14160999655723572, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0035, + "step": 26430 + }, + { + "epoch": 1.6182140889895342, + "grad_norm": 0.09427947551012039, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0041, + "step": 26440 + }, + { + "epoch": 1.6188261215496664, + "grad_norm": 0.07515032589435577, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0032, + "step": 26450 + }, + { + "epoch": 1.6194381541097986, + "grad_norm": 0.19633768498897552, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0043, + "step": 26460 + }, + { + "epoch": 1.6200501866699308, + "grad_norm": 0.22237136960029602, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.003, + "step": 26470 + }, + { + "epoch": 1.620662219230063, + "grad_norm": 0.21898943185806274, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0033, + "step": 26480 + }, + { + "epoch": 1.6212742517901952, + "grad_norm": 0.14833909273147583, + "learning_rate": 2.643185095075473e-06, + "loss": 0.003, + "step": 26490 + }, + { + "epoch": 1.6218862843503274, + "grad_norm": 0.10988935828208923, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0036, + "step": 26500 + }, + { + "epoch": 1.6224983169104596, + "grad_norm": 0.17635370790958405, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0047, + "step": 26510 + }, + { + "epoch": 1.6231103494705919, + "grad_norm": 0.15108852088451385, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0034, + "step": 26520 + }, + { + "epoch": 1.623722382030724, + "grad_norm": 0.1829880177974701, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0041, + "step": 26530 + }, + { + "epoch": 1.6243344145908563, + "grad_norm": 0.15146563947200775, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0034, + "step": 26540 + }, + { + "epoch": 1.6249464471509885, + "grad_norm": 0.1440849006175995, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0044, + "step": 26550 + }, + { + "epoch": 1.6255584797111207, + "grad_norm": 0.1681547313928604, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0045, + "step": 26560 + }, + { + "epoch": 1.6261705122712529, + "grad_norm": 0.07170043885707855, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0028, + "step": 26570 + }, + { + "epoch": 1.626782544831385, + "grad_norm": 0.0961712971329689, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0026, + "step": 26580 + }, + { + "epoch": 1.6273945773915173, + "grad_norm": 0.0957784354686737, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0031, + "step": 26590 + }, + { + "epoch": 1.6280066099516495, + "grad_norm": 0.09888478368520737, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0042, + "step": 26600 + }, + { + "epoch": 1.6286186425117817, + "grad_norm": 0.1469460278749466, + "learning_rate": 2.600457796416397e-06, + "loss": 0.003, + "step": 26610 + }, + { + "epoch": 1.6292306750719139, + "grad_norm": 0.23431086540222168, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.003, + "step": 26620 + }, + { + "epoch": 1.6298427076320459, + "grad_norm": 0.11390798538923264, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0026, + "step": 26630 + }, + { + "epoch": 1.630454740192178, + "grad_norm": 0.17735126614570618, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0032, + "step": 26640 + }, + { + "epoch": 1.6310667727523103, + "grad_norm": 0.047082606703042984, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0036, + "step": 26650 + }, + { + "epoch": 1.6316788053124425, + "grad_norm": 0.3262721598148346, + "learning_rate": 2.583073279935805e-06, + "loss": 0.004, + "step": 26660 + }, + { + "epoch": 1.6322908378725747, + "grad_norm": 0.2153632938861847, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.003, + "step": 26670 + }, + { + "epoch": 1.6329028704327069, + "grad_norm": 0.12398967891931534, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0031, + "step": 26680 + }, + { + "epoch": 1.633514902992839, + "grad_norm": 0.404419481754303, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0038, + "step": 26690 + }, + { + "epoch": 1.6341269355529713, + "grad_norm": 0.3094029426574707, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.005, + "step": 26700 + }, + { + "epoch": 1.6347389681131035, + "grad_norm": 0.23702147603034973, + "learning_rate": 2.565935706183804e-06, + "loss": 0.003, + "step": 26710 + }, + { + "epoch": 1.6353510006732357, + "grad_norm": 0.175592839717865, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0038, + "step": 26720 + }, + { + "epoch": 1.635963033233368, + "grad_norm": 0.20330312848091125, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0034, + "step": 26730 + }, + { + "epoch": 1.6365750657935, + "grad_norm": 0.1990291029214859, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0031, + "step": 26740 + }, + { + "epoch": 1.6371870983536323, + "grad_norm": 0.2611120343208313, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0026, + "step": 26750 + }, + { + "epoch": 1.6377991309137645, + "grad_norm": 0.15563850104808807, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0029, + "step": 26760 + }, + { + "epoch": 1.6384111634738967, + "grad_norm": 0.10159289091825485, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0027, + "step": 26770 + }, + { + "epoch": 1.639023196034029, + "grad_norm": 0.14164364337921143, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0022, + "step": 26780 + }, + { + "epoch": 1.639635228594161, + "grad_norm": 0.09149957448244095, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0042, + "step": 26790 + }, + { + "epoch": 1.6402472611542933, + "grad_norm": 0.19528718292713165, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0021, + "step": 26800 + }, + { + "epoch": 1.6408592937144255, + "grad_norm": 0.11716540157794952, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0029, + "step": 26810 + }, + { + "epoch": 1.6414713262745577, + "grad_norm": 0.06402851641178131, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0023, + "step": 26820 + }, + { + "epoch": 1.64208335883469, + "grad_norm": 0.12224840372800827, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0029, + "step": 26830 + }, + { + "epoch": 1.6426953913948221, + "grad_norm": 0.13217593729496002, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0035, + "step": 26840 + }, + { + "epoch": 1.6433074239549543, + "grad_norm": 0.15030793845653534, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0026, + "step": 26850 + }, + { + "epoch": 1.6439194565150865, + "grad_norm": 0.10057740658521652, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0044, + "step": 26860 + }, + { + "epoch": 1.6445314890752187, + "grad_norm": 0.19387565553188324, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0032, + "step": 26870 + }, + { + "epoch": 1.645143521635351, + "grad_norm": 0.32513365149497986, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0026, + "step": 26880 + }, + { + "epoch": 1.6457555541954831, + "grad_norm": 0.11426142603158951, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0035, + "step": 26890 + }, + { + "epoch": 1.6463675867556153, + "grad_norm": 0.15678571164608002, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0033, + "step": 26900 + }, + { + "epoch": 1.6469796193157475, + "grad_norm": 0.0901828184723854, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0033, + "step": 26910 + }, + { + "epoch": 1.6475916518758797, + "grad_norm": 0.1439771205186844, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0036, + "step": 26920 + }, + { + "epoch": 1.648203684436012, + "grad_norm": 0.08516893535852432, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0031, + "step": 26930 + }, + { + "epoch": 1.6488157169961442, + "grad_norm": 0.13487808406352997, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0038, + "step": 26940 + }, + { + "epoch": 1.6494277495562764, + "grad_norm": 0.12181483954191208, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0048, + "step": 26950 + }, + { + "epoch": 1.6500397821164086, + "grad_norm": 0.11907542496919632, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0028, + "step": 26960 + }, + { + "epoch": 1.6506518146765408, + "grad_norm": 0.11463847011327744, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0032, + "step": 26970 + }, + { + "epoch": 1.651263847236673, + "grad_norm": 0.10308004170656204, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0027, + "step": 26980 + }, + { + "epoch": 1.6518758797968052, + "grad_norm": 0.1553436815738678, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0032, + "step": 26990 + }, + { + "epoch": 1.6524879123569374, + "grad_norm": 0.11983859539031982, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0031, + "step": 27000 + }, + { + "epoch": 1.6530999449170696, + "grad_norm": 0.07867950201034546, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.003, + "step": 27010 + }, + { + "epoch": 1.6537119774772018, + "grad_norm": 0.0990489274263382, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0035, + "step": 27020 + }, + { + "epoch": 1.654324010037334, + "grad_norm": 0.15849289298057556, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0029, + "step": 27030 + }, + { + "epoch": 1.6549360425974662, + "grad_norm": 0.23918525874614716, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0032, + "step": 27040 + }, + { + "epoch": 1.6555480751575984, + "grad_norm": 0.15686926245689392, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.003, + "step": 27050 + }, + { + "epoch": 1.6561601077177306, + "grad_norm": 0.06435749679803848, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.003, + "step": 27060 + }, + { + "epoch": 1.6567721402778628, + "grad_norm": 0.1966746598482132, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0029, + "step": 27070 + }, + { + "epoch": 1.657384172837995, + "grad_norm": 0.1173984557390213, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0034, + "step": 27080 + }, + { + "epoch": 1.6579962053981272, + "grad_norm": 0.15185165405273438, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0036, + "step": 27090 + }, + { + "epoch": 1.6586082379582594, + "grad_norm": 0.1371954381465912, + "learning_rate": 2.440792688039862e-06, + "loss": 0.002, + "step": 27100 + }, + { + "epoch": 1.6592202705183916, + "grad_norm": 0.10718704760074615, + "learning_rate": 2.437783861778914e-06, + "loss": 0.003, + "step": 27110 + }, + { + "epoch": 1.6598323030785238, + "grad_norm": 0.09085255861282349, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0028, + "step": 27120 + }, + { + "epoch": 1.660444335638656, + "grad_norm": 0.12604662775993347, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0029, + "step": 27130 + }, + { + "epoch": 1.6610563681987882, + "grad_norm": 0.06227592006325722, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0034, + "step": 27140 + }, + { + "epoch": 1.6616684007589204, + "grad_norm": 0.15667739510536194, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0029, + "step": 27150 + }, + { + "epoch": 1.6622804333190526, + "grad_norm": 0.11927297711372375, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0025, + "step": 27160 + }, + { + "epoch": 1.6628924658791848, + "grad_norm": 0.13583429157733917, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0049, + "step": 27170 + }, + { + "epoch": 1.663504498439317, + "grad_norm": 0.31264790892601013, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0032, + "step": 27180 + }, + { + "epoch": 1.6641165309994492, + "grad_norm": 0.1507059931755066, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0041, + "step": 27190 + }, + { + "epoch": 1.6647285635595814, + "grad_norm": 0.22571611404418945, + "learning_rate": 2.411157015949963e-06, + "loss": 0.006, + "step": 27200 + }, + { + "epoch": 1.6653405961197136, + "grad_norm": 0.07582036405801773, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0025, + "step": 27210 + }, + { + "epoch": 1.6659526286798458, + "grad_norm": 0.16827397048473358, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0025, + "step": 27220 + }, + { + "epoch": 1.666564661239978, + "grad_norm": 0.26645299792289734, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0034, + "step": 27230 + }, + { + "epoch": 1.6671766938001102, + "grad_norm": 0.15947957336902618, + "learning_rate": 2.399584779188417e-06, + "loss": 0.003, + "step": 27240 + }, + { + "epoch": 1.6677887263602424, + "grad_norm": 0.16127845644950867, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0049, + "step": 27250 + }, + { + "epoch": 1.6684007589203746, + "grad_norm": 0.1279461681842804, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0027, + "step": 27260 + }, + { + "epoch": 1.6690127914805069, + "grad_norm": 0.06649098545312881, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0021, + "step": 27270 + }, + { + "epoch": 1.669624824040639, + "grad_norm": 0.196940615773201, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0024, + "step": 27280 + }, + { + "epoch": 1.6702368566007713, + "grad_norm": 0.07980433851480484, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0028, + "step": 27290 + }, + { + "epoch": 1.6708488891609035, + "grad_norm": 0.10023880004882812, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0027, + "step": 27300 + }, + { + "epoch": 1.6714609217210357, + "grad_norm": 0.12118209153413773, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0035, + "step": 27310 + }, + { + "epoch": 1.6720729542811679, + "grad_norm": 0.1536104530096054, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0032, + "step": 27320 + }, + { + "epoch": 1.6726849868413, + "grad_norm": 0.0671612024307251, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0025, + "step": 27330 + }, + { + "epoch": 1.6732970194014323, + "grad_norm": 0.17756326496601105, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0033, + "step": 27340 + }, + { + "epoch": 1.6739090519615645, + "grad_norm": 0.07412310689687729, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0039, + "step": 27350 + }, + { + "epoch": 1.6745210845216967, + "grad_norm": 0.17036253213882446, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.0046, + "step": 27360 + }, + { + "epoch": 1.6751331170818289, + "grad_norm": 0.07159245759248734, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0025, + "step": 27370 + }, + { + "epoch": 1.675745149641961, + "grad_norm": 0.11311008781194687, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0028, + "step": 27380 + }, + { + "epoch": 1.6763571822020933, + "grad_norm": 0.062365781515836716, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0029, + "step": 27390 + }, + { + "epoch": 1.6769692147622255, + "grad_norm": 0.1132882833480835, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.003, + "step": 27400 + }, + { + "epoch": 1.6775812473223577, + "grad_norm": 0.2946174740791321, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.003, + "step": 27410 + }, + { + "epoch": 1.67819327988249, + "grad_norm": 0.22978715598583221, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0028, + "step": 27420 + }, + { + "epoch": 1.678805312442622, + "grad_norm": 0.12381251156330109, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0035, + "step": 27430 + }, + { + "epoch": 1.6794173450027543, + "grad_norm": 0.3918306231498718, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0036, + "step": 27440 + }, + { + "epoch": 1.6800293775628865, + "grad_norm": 0.1729428470134735, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0039, + "step": 27450 + }, + { + "epoch": 1.6806414101230187, + "grad_norm": 0.10841631144285202, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0032, + "step": 27460 + }, + { + "epoch": 1.6812534426831507, + "grad_norm": 0.12045114487409592, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0048, + "step": 27470 + }, + { + "epoch": 1.681865475243283, + "grad_norm": 0.15946263074874878, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0025, + "step": 27480 + }, + { + "epoch": 1.682477507803415, + "grad_norm": 0.20978282392024994, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0039, + "step": 27490 + }, + { + "epoch": 1.6830895403635473, + "grad_norm": 0.4889276325702667, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.005, + "step": 27500 + }, + { + "epoch": 1.6837015729236795, + "grad_norm": 0.10033760219812393, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0032, + "step": 27510 + }, + { + "epoch": 1.6843136054838117, + "grad_norm": 0.16516660153865814, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0043, + "step": 27520 + }, + { + "epoch": 1.684925638043944, + "grad_norm": 0.15988346934318542, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0031, + "step": 27530 + }, + { + "epoch": 1.685537670604076, + "grad_norm": 0.0838918536901474, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0033, + "step": 27540 + }, + { + "epoch": 1.6861497031642083, + "grad_norm": 0.09774886816740036, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0037, + "step": 27550 + }, + { + "epoch": 1.6867617357243405, + "grad_norm": 0.11428319662809372, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0028, + "step": 27560 + }, + { + "epoch": 1.6873737682844727, + "grad_norm": 0.0789853185415268, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0033, + "step": 27570 + }, + { + "epoch": 1.687985800844605, + "grad_norm": 0.12702232599258423, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0033, + "step": 27580 + }, + { + "epoch": 1.6885978334047371, + "grad_norm": 0.12080296128988266, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0027, + "step": 27590 + }, + { + "epoch": 1.6892098659648693, + "grad_norm": 0.21917396783828735, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0032, + "step": 27600 + }, + { + "epoch": 1.6898218985250015, + "grad_norm": 0.28265318274497986, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0036, + "step": 27610 + }, + { + "epoch": 1.6904339310851337, + "grad_norm": 0.09106706827878952, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0029, + "step": 27620 + }, + { + "epoch": 1.691045963645266, + "grad_norm": 0.1670890897512436, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.0024, + "step": 27630 + }, + { + "epoch": 1.6916579962053981, + "grad_norm": 0.16830581426620483, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0033, + "step": 27640 + }, + { + "epoch": 1.6922700287655303, + "grad_norm": 0.3394775092601776, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0045, + "step": 27650 + }, + { + "epoch": 1.6928820613256625, + "grad_norm": 0.11403192579746246, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0047, + "step": 27660 + }, + { + "epoch": 1.6934940938857947, + "grad_norm": 0.12133318930864334, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0038, + "step": 27670 + }, + { + "epoch": 1.694106126445927, + "grad_norm": 0.2074453979730606, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0045, + "step": 27680 + }, + { + "epoch": 1.6947181590060592, + "grad_norm": 0.0654371827840805, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0023, + "step": 27690 + }, + { + "epoch": 1.6953301915661914, + "grad_norm": 0.3289278745651245, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0043, + "step": 27700 + }, + { + "epoch": 1.6959422241263236, + "grad_norm": 0.10692958533763885, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0026, + "step": 27710 + }, + { + "epoch": 1.6965542566864558, + "grad_norm": 0.06414066255092621, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0022, + "step": 27720 + }, + { + "epoch": 1.697166289246588, + "grad_norm": 0.22467097640037537, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0028, + "step": 27730 + }, + { + "epoch": 1.6977783218067202, + "grad_norm": 0.14074043929576874, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0033, + "step": 27740 + }, + { + "epoch": 1.6983903543668522, + "grad_norm": 0.17113615572452545, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0033, + "step": 27750 + }, + { + "epoch": 1.6990023869269844, + "grad_norm": 0.09429248422384262, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0027, + "step": 27760 + }, + { + "epoch": 1.6996144194871166, + "grad_norm": 0.06843049824237823, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0044, + "step": 27770 + }, + { + "epoch": 1.7002264520472488, + "grad_norm": 0.08251061290502548, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0029, + "step": 27780 + }, + { + "epoch": 1.700838484607381, + "grad_norm": 0.29461193084716797, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0037, + "step": 27790 + }, + { + "epoch": 1.7014505171675132, + "grad_norm": 0.11461394280195236, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0027, + "step": 27800 + }, + { + "epoch": 1.7020625497276454, + "grad_norm": 0.15875136852264404, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.0031, + "step": 27810 + }, + { + "epoch": 1.7026745822877776, + "grad_norm": 0.097860187292099, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0034, + "step": 27820 + }, + { + "epoch": 1.7032866148479098, + "grad_norm": 0.07356908917427063, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0026, + "step": 27830 + }, + { + "epoch": 1.703898647408042, + "grad_norm": 0.1890958547592163, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0037, + "step": 27840 + }, + { + "epoch": 1.7045106799681742, + "grad_norm": 0.1173754408955574, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0034, + "step": 27850 + }, + { + "epoch": 1.7051227125283064, + "grad_norm": 0.2559126019477844, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0024, + "step": 27860 + }, + { + "epoch": 1.7057347450884386, + "grad_norm": 0.17337289452552795, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0026, + "step": 27870 + }, + { + "epoch": 1.7063467776485708, + "grad_norm": 0.34073203802108765, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0057, + "step": 27880 + }, + { + "epoch": 1.706958810208703, + "grad_norm": 0.1395779252052307, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0029, + "step": 27890 + }, + { + "epoch": 1.7075708427688352, + "grad_norm": 0.07645416259765625, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0034, + "step": 27900 + }, + { + "epoch": 1.7081828753289674, + "grad_norm": 0.19695641100406647, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0031, + "step": 27910 + }, + { + "epoch": 1.7087949078890996, + "grad_norm": 0.09641100466251373, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0029, + "step": 27920 + }, + { + "epoch": 1.7094069404492318, + "grad_norm": 0.13393571972846985, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0029, + "step": 27930 + }, + { + "epoch": 1.710018973009364, + "grad_norm": 0.12252296507358551, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0035, + "step": 27940 + }, + { + "epoch": 1.7106310055694962, + "grad_norm": 0.18026909232139587, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0033, + "step": 27950 + }, + { + "epoch": 1.7112430381296284, + "grad_norm": 0.11210714280605316, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0041, + "step": 27960 + }, + { + "epoch": 1.7118550706897606, + "grad_norm": 0.08154530823230743, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0023, + "step": 27970 + }, + { + "epoch": 1.7124671032498928, + "grad_norm": 0.11625959724187851, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.002, + "step": 27980 + }, + { + "epoch": 1.713079135810025, + "grad_norm": 0.17261847853660583, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0034, + "step": 27990 + }, + { + "epoch": 1.7136911683701572, + "grad_norm": 0.2842121422290802, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0037, + "step": 28000 + }, + { + "epoch": 1.7143032009302894, + "grad_norm": 0.0846833735704422, + "learning_rate": 2.208532855337684e-06, + "loss": 0.003, + "step": 28010 + }, + { + "epoch": 1.7149152334904216, + "grad_norm": 0.14987042546272278, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.0028, + "step": 28020 + }, + { + "epoch": 1.7155272660505538, + "grad_norm": 0.11375584453344345, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0035, + "step": 28030 + }, + { + "epoch": 1.716139298610686, + "grad_norm": 0.15772263705730438, + "learning_rate": 2.202319410491029e-06, + "loss": 0.0028, + "step": 28040 + }, + { + "epoch": 1.7167513311708182, + "grad_norm": 0.09632930904626846, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.0026, + "step": 28050 + }, + { + "epoch": 1.7173633637309504, + "grad_norm": 0.10105090588331223, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0028, + "step": 28060 + }, + { + "epoch": 1.7179753962910826, + "grad_norm": 0.07200506329536438, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0026, + "step": 28070 + }, + { + "epoch": 1.7185874288512148, + "grad_norm": 0.11725947260856628, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0036, + "step": 28080 + }, + { + "epoch": 1.719199461411347, + "grad_norm": 0.20034383237361908, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.0027, + "step": 28090 + }, + { + "epoch": 1.7198114939714793, + "grad_norm": 0.1051931157708168, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0035, + "step": 28100 + }, + { + "epoch": 1.7204235265316115, + "grad_norm": 0.15733452141284943, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0032, + "step": 28110 + }, + { + "epoch": 1.7210355590917437, + "grad_norm": 0.1562259942293167, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.0037, + "step": 28120 + }, + { + "epoch": 1.7216475916518759, + "grad_norm": 0.12120307981967926, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0027, + "step": 28130 + }, + { + "epoch": 1.722259624212008, + "grad_norm": 0.10682159662246704, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0028, + "step": 28140 + }, + { + "epoch": 1.7228716567721403, + "grad_norm": 0.3744218349456787, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0031, + "step": 28150 + }, + { + "epoch": 1.7234836893322725, + "grad_norm": 0.10296724736690521, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0037, + "step": 28160 + }, + { + "epoch": 1.7240957218924047, + "grad_norm": 0.10784043371677399, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0026, + "step": 28170 + }, + { + "epoch": 1.7247077544525369, + "grad_norm": 0.11451563239097595, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0042, + "step": 28180 + }, + { + "epoch": 1.725319787012669, + "grad_norm": 0.23215091228485107, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0057, + "step": 28190 + }, + { + "epoch": 1.7259318195728013, + "grad_norm": 0.26827526092529297, + "learning_rate": 2.17075192815944e-06, + "loss": 0.0045, + "step": 28200 + }, + { + "epoch": 1.7265438521329335, + "grad_norm": 0.18574558198451996, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0031, + "step": 28210 + }, + { + "epoch": 1.7271558846930657, + "grad_norm": 0.13175436854362488, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0037, + "step": 28220 + }, + { + "epoch": 1.727767917253198, + "grad_norm": 0.11453181505203247, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0024, + "step": 28230 + }, + { + "epoch": 1.72837994981333, + "grad_norm": 0.07552323490381241, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0028, + "step": 28240 + }, + { + "epoch": 1.7289919823734623, + "grad_norm": 0.19949491322040558, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0037, + "step": 28250 + }, + { + "epoch": 1.7296040149335945, + "grad_norm": 0.11841476708650589, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.003, + "step": 28260 + }, + { + "epoch": 1.7302160474937267, + "grad_norm": 0.07479251176118851, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0026, + "step": 28270 + }, + { + "epoch": 1.730828080053859, + "grad_norm": 0.09975548088550568, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0038, + "step": 28280 + }, + { + "epoch": 1.7314401126139911, + "grad_norm": 0.11624854803085327, + "learning_rate": 2.154160296808588e-06, + "loss": 0.0022, + "step": 28290 + }, + { + "epoch": 1.7320521451741233, + "grad_norm": 0.11251319199800491, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0028, + "step": 28300 + }, + { + "epoch": 1.7326641777342555, + "grad_norm": 0.1166340559720993, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0021, + "step": 28310 + }, + { + "epoch": 1.7332762102943877, + "grad_norm": 0.09875024855136871, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0035, + "step": 28320 + }, + { + "epoch": 1.73388824285452, + "grad_norm": 0.1096075028181076, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0023, + "step": 28330 + }, + { + "epoch": 1.7345002754146521, + "grad_norm": 0.07303491234779358, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0033, + "step": 28340 + }, + { + "epoch": 1.7351123079747843, + "grad_norm": 0.079298235476017, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0029, + "step": 28350 + }, + { + "epoch": 1.7357243405349165, + "grad_norm": 0.23943912982940674, + "learning_rate": 2.141836952964938e-06, + "loss": 0.0028, + "step": 28360 + }, + { + "epoch": 1.7363363730950487, + "grad_norm": 0.16530318558216095, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0026, + "step": 28370 + }, + { + "epoch": 1.736948405655181, + "grad_norm": 0.30809924006462097, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0038, + "step": 28380 + }, + { + "epoch": 1.7375604382153131, + "grad_norm": 0.1776202917098999, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0025, + "step": 28390 + }, + { + "epoch": 1.7381724707754453, + "grad_norm": 0.12845134735107422, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0025, + "step": 28400 + }, + { + "epoch": 1.7387845033355775, + "grad_norm": 0.3023861050605774, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0027, + "step": 28410 + }, + { + "epoch": 1.7393965358957098, + "grad_norm": 0.20150741934776306, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0037, + "step": 28420 + }, + { + "epoch": 1.740008568455842, + "grad_norm": 0.13612216711044312, + "learning_rate": 2.130022988569117e-06, + "loss": 0.003, + "step": 28430 + }, + { + "epoch": 1.7406206010159742, + "grad_norm": 0.08449587225914001, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0029, + "step": 28440 + }, + { + "epoch": 1.7412326335761064, + "grad_norm": 0.15262214839458466, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0036, + "step": 28450 + }, + { + "epoch": 1.7418446661362386, + "grad_norm": 0.12141858786344528, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0033, + "step": 28460 + }, + { + "epoch": 1.7424566986963708, + "grad_norm": 0.16376341879367828, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0026, + "step": 28470 + }, + { + "epoch": 1.743068731256503, + "grad_norm": 0.20567956566810608, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0035, + "step": 28480 + }, + { + "epoch": 1.7436807638166352, + "grad_norm": 0.09294500946998596, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.003, + "step": 28490 + }, + { + "epoch": 1.7442927963767674, + "grad_norm": 0.08701831847429276, + "learning_rate": 2.118719082970852e-06, + "loss": 0.0035, + "step": 28500 + }, + { + "epoch": 1.7449048289368996, + "grad_norm": 0.05340641364455223, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0033, + "step": 28510 + }, + { + "epoch": 1.7455168614970318, + "grad_norm": 0.16895434260368347, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0041, + "step": 28520 + }, + { + "epoch": 1.746128894057164, + "grad_norm": 0.192590594291687, + "learning_rate": 2.114030854571176e-06, + "loss": 0.004, + "step": 28530 + }, + { + "epoch": 1.7467409266172962, + "grad_norm": 0.07753138244152069, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0029, + "step": 28540 + }, + { + "epoch": 1.7473529591774284, + "grad_norm": 0.2521173655986786, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.004, + "step": 28550 + }, + { + "epoch": 1.7479649917375606, + "grad_norm": 0.0666038915514946, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0027, + "step": 28560 + }, + { + "epoch": 1.7485770242976928, + "grad_norm": 0.098371222615242, + "learning_rate": 2.107925886189472e-06, + "loss": 0.0046, + "step": 28570 + }, + { + "epoch": 1.749189056857825, + "grad_norm": 0.10023763030767441, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.003, + "step": 28580 + }, + { + "epoch": 1.7498010894179572, + "grad_norm": 0.18487419188022614, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0023, + "step": 28590 + }, + { + "epoch": 1.7504131219780892, + "grad_norm": 0.068140909075737, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0031, + "step": 28600 + }, + { + "epoch": 1.7510251545382214, + "grad_norm": 0.203145369887352, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.0045, + "step": 28610 + }, + { + "epoch": 1.7516371870983536, + "grad_norm": 0.1300811469554901, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0033, + "step": 28620 + }, + { + "epoch": 1.7522492196584858, + "grad_norm": 0.10490277409553528, + "learning_rate": 2.099081528450828e-06, + "loss": 0.003, + "step": 28630 + }, + { + "epoch": 1.752861252218618, + "grad_norm": 0.07734280824661255, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0027, + "step": 28640 + }, + { + "epoch": 1.7534732847787502, + "grad_norm": 0.09990867972373962, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0025, + "step": 28650 + }, + { + "epoch": 1.7540853173388824, + "grad_norm": 0.087434321641922, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0026, + "step": 28660 + }, + { + "epoch": 1.7546973498990146, + "grad_norm": 0.14746612310409546, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.0024, + "step": 28670 + }, + { + "epoch": 1.7553093824591468, + "grad_norm": 0.10767928510904312, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0026, + "step": 28680 + }, + { + "epoch": 1.755921415019279, + "grad_norm": 0.1551862210035324, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0025, + "step": 28690 + }, + { + "epoch": 1.7565334475794112, + "grad_norm": 0.09829024225473404, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0039, + "step": 28700 + }, + { + "epoch": 1.7571454801395434, + "grad_norm": 0.10503874719142914, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0032, + "step": 28710 + }, + { + "epoch": 1.7577575126996756, + "grad_norm": 0.0730491355061531, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0031, + "step": 28720 + }, + { + "epoch": 1.7583695452598078, + "grad_norm": 0.10079263150691986, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0049, + "step": 28730 + }, + { + "epoch": 1.75898157781994, + "grad_norm": 0.09458324313163757, + "learning_rate": 2.083843803000755e-06, + "loss": 0.0032, + "step": 28740 + }, + { + "epoch": 1.7595936103800722, + "grad_norm": 0.10003770887851715, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0028, + "step": 28750 + }, + { + "epoch": 1.7602056429402044, + "grad_norm": 0.08591483533382416, + "learning_rate": 2.081209272216522e-06, + "loss": 0.004, + "step": 28760 + }, + { + "epoch": 1.7608176755003366, + "grad_norm": 0.06842748820781708, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0027, + "step": 28770 + }, + { + "epoch": 1.7614297080604688, + "grad_norm": 0.20110534131526947, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0032, + "step": 28780 + }, + { + "epoch": 1.762041740620601, + "grad_norm": 0.0891185775399208, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0032, + "step": 28790 + }, + { + "epoch": 1.7626537731807332, + "grad_norm": 0.0719524472951889, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0032, + "step": 28800 + }, + { + "epoch": 1.7632658057408654, + "grad_norm": 0.08921847492456436, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0021, + "step": 28810 + }, + { + "epoch": 1.7638778383009976, + "grad_norm": 0.15532712638378143, + "learning_rate": 2.073556929020357e-06, + "loss": 0.0036, + "step": 28820 + }, + { + "epoch": 1.7644898708611298, + "grad_norm": 0.09795820713043213, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0038, + "step": 28830 + }, + { + "epoch": 1.765101903421262, + "grad_norm": 0.16977304220199585, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0027, + "step": 28840 + }, + { + "epoch": 1.7657139359813943, + "grad_norm": 0.15005043148994446, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.0034, + "step": 28850 + }, + { + "epoch": 1.7663259685415265, + "grad_norm": 0.2764229476451874, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0036, + "step": 28860 + }, + { + "epoch": 1.7669380011016587, + "grad_norm": 0.10011457651853561, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0036, + "step": 28870 + }, + { + "epoch": 1.7675500336617906, + "grad_norm": 0.13169759511947632, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0042, + "step": 28880 + }, + { + "epoch": 1.7681620662219228, + "grad_norm": 0.09271719306707382, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0024, + "step": 28890 + }, + { + "epoch": 1.768774098782055, + "grad_norm": 0.08888175338506699, + "learning_rate": 2.063940494310565e-06, + "loss": 0.003, + "step": 28900 + }, + { + "epoch": 1.7693861313421873, + "grad_norm": 0.09285194426774979, + "learning_rate": 2.062785616754097e-06, + "loss": 0.0029, + "step": 28910 + }, + { + "epoch": 1.7699981639023195, + "grad_norm": 0.16032962501049042, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0036, + "step": 28920 + }, + { + "epoch": 1.7706101964624517, + "grad_norm": 0.1677922010421753, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0029, + "step": 28930 + }, + { + "epoch": 1.7712222290225839, + "grad_norm": 0.16370612382888794, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0032, + "step": 28940 + }, + { + "epoch": 1.771834261582716, + "grad_norm": 0.19864866137504578, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0035, + "step": 28950 + }, + { + "epoch": 1.7724462941428483, + "grad_norm": 0.06023133546113968, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0037, + "step": 28960 + }, + { + "epoch": 1.7730583267029805, + "grad_norm": 0.15308921039104462, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0037, + "step": 28970 + }, + { + "epoch": 1.7736703592631127, + "grad_norm": 0.061173055320978165, + "learning_rate": 2.054995221133923e-06, + "loss": 0.0036, + "step": 28980 + }, + { + "epoch": 1.7742823918232449, + "grad_norm": 0.11913572996854782, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0028, + "step": 28990 + }, + { + "epoch": 1.774894424383377, + "grad_norm": 0.09992241114377975, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0029, + "step": 29000 + }, + { + "epoch": 1.7755064569435093, + "grad_norm": 0.2562020719051361, + "learning_rate": 2.051813916260501e-06, + "loss": 0.0034, + "step": 29010 + }, + { + "epoch": 1.7761184895036415, + "grad_norm": 0.06800663471221924, + "learning_rate": 2.050774482045273e-06, + "loss": 0.0031, + "step": 29020 + }, + { + "epoch": 1.7767305220637737, + "grad_norm": 0.09397796541452408, + "learning_rate": 2.049745550244661e-06, + "loss": 0.0029, + "step": 29030 + }, + { + "epoch": 1.777342554623906, + "grad_norm": 0.4348801076412201, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0031, + "step": 29040 + }, + { + "epoch": 1.777954587184038, + "grad_norm": 0.11066912859678268, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0032, + "step": 29050 + }, + { + "epoch": 1.7785666197441703, + "grad_norm": 0.12962423264980316, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.0025, + "step": 29060 + }, + { + "epoch": 1.7791786523043025, + "grad_norm": 0.12331631779670715, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.003, + "step": 29070 + }, + { + "epoch": 1.7797906848644347, + "grad_norm": 0.14841991662979126, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.004, + "step": 29080 + }, + { + "epoch": 1.780402717424567, + "grad_norm": 0.060349978506565094, + "learning_rate": 2.043792576905478e-06, + "loss": 0.0021, + "step": 29090 + }, + { + "epoch": 1.781014749984699, + "grad_norm": 0.3353869616985321, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.004, + "step": 29100 + }, + { + "epoch": 1.7816267825448313, + "grad_norm": 0.1450352966785431, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0027, + "step": 29110 + }, + { + "epoch": 1.7822388151049635, + "grad_norm": 0.17684252560138702, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0032, + "step": 29120 + }, + { + "epoch": 1.7828508476650957, + "grad_norm": 0.153119757771492, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0031, + "step": 29130 + }, + { + "epoch": 1.783462880225228, + "grad_norm": 0.09753888100385666, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0038, + "step": 29140 + }, + { + "epoch": 1.7840749127853601, + "grad_norm": 0.1232074424624443, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0033, + "step": 29150 + }, + { + "epoch": 1.7846869453454923, + "grad_norm": 0.13487598299980164, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.004, + "step": 29160 + }, + { + "epoch": 1.7852989779056245, + "grad_norm": 0.1717495173215866, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.0032, + "step": 29170 + }, + { + "epoch": 1.7859110104657567, + "grad_norm": 0.21602065861225128, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0036, + "step": 29180 + }, + { + "epoch": 1.786523043025889, + "grad_norm": 0.18952055275440216, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0039, + "step": 29190 + }, + { + "epoch": 1.7871350755860211, + "grad_norm": 0.18922209739685059, + "learning_rate": 2.033861701352752e-06, + "loss": 0.0031, + "step": 29200 + }, + { + "epoch": 1.7877471081461533, + "grad_norm": 0.10511717200279236, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.0034, + "step": 29210 + }, + { + "epoch": 1.7883591407062855, + "grad_norm": 0.11389610171318054, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0049, + "step": 29220 + }, + { + "epoch": 1.7889711732664177, + "grad_norm": 0.08670853078365326, + "learning_rate": 2.031374190660474e-06, + "loss": 0.0021, + "step": 29230 + }, + { + "epoch": 1.78958320582655, + "grad_norm": 0.08372897654771805, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0063, + "step": 29240 + }, + { + "epoch": 1.7901952383866822, + "grad_norm": 0.08623497933149338, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.0022, + "step": 29250 + }, + { + "epoch": 1.7908072709468144, + "grad_norm": 0.2859722375869751, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0034, + "step": 29260 + }, + { + "epoch": 1.7914193035069466, + "grad_norm": 0.12175265699625015, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0033, + "step": 29270 + }, + { + "epoch": 1.7920313360670788, + "grad_norm": 0.07477760314941406, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0025, + "step": 29280 + }, + { + "epoch": 1.792643368627211, + "grad_norm": 0.16364359855651855, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.0025, + "step": 29290 + }, + { + "epoch": 1.7932554011873432, + "grad_norm": 0.1571386754512787, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0034, + "step": 29300 + }, + { + "epoch": 1.7938674337474754, + "grad_norm": 0.4267171323299408, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0036, + "step": 29310 + }, + { + "epoch": 1.7944794663076076, + "grad_norm": 0.1351214498281479, + "learning_rate": 2.02448011805316e-06, + "loss": 0.0035, + "step": 29320 + }, + { + "epoch": 1.7950914988677398, + "grad_norm": 0.39643657207489014, + "learning_rate": 2.023766764732934e-06, + "loss": 0.0036, + "step": 29330 + }, + { + "epoch": 1.795703531427872, + "grad_norm": 0.10161790996789932, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0032, + "step": 29340 + }, + { + "epoch": 1.7963155639880042, + "grad_norm": 0.1493646800518036, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0036, + "step": 29350 + }, + { + "epoch": 1.7969275965481364, + "grad_norm": 0.12911222875118256, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0029, + "step": 29360 + }, + { + "epoch": 1.7975396291082686, + "grad_norm": 0.11447032541036606, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0042, + "step": 29370 + }, + { + "epoch": 1.7981516616684008, + "grad_norm": 0.15647603571414948, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0034, + "step": 29380 + }, + { + "epoch": 1.798763694228533, + "grad_norm": 0.14736993610858917, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0028, + "step": 29390 + }, + { + "epoch": 1.7993757267886652, + "grad_norm": 0.10555008798837662, + "learning_rate": 2.01906829188547e-06, + "loss": 0.0031, + "step": 29400 + }, + { + "epoch": 1.7999877593487974, + "grad_norm": 0.08822382241487503, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0029, + "step": 29410 + }, + { + "epoch": 1.8005997919089296, + "grad_norm": 0.08774827420711517, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0029, + "step": 29420 + }, + { + "epoch": 1.8012118244690618, + "grad_norm": 0.12119588255882263, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0031, + "step": 29430 + }, + { + "epoch": 1.801823857029194, + "grad_norm": 0.14716175198554993, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.0037, + "step": 29440 + }, + { + "epoch": 1.8024358895893262, + "grad_norm": 0.09904798865318298, + "learning_rate": 2.016028412496094e-06, + "loss": 0.004, + "step": 29450 + }, + { + "epoch": 1.8030479221494584, + "grad_norm": 0.05114385858178139, + "learning_rate": 2.015452063399292e-06, + "loss": 0.003, + "step": 29460 + }, + { + "epoch": 1.8036599547095906, + "grad_norm": 0.12696151435375214, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0033, + "step": 29470 + }, + { + "epoch": 1.8042719872697228, + "grad_norm": 0.155229389667511, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0045, + "step": 29480 + }, + { + "epoch": 1.804884019829855, + "grad_norm": 0.22578737139701843, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0023, + "step": 29490 + }, + { + "epoch": 1.8054960523899872, + "grad_norm": 0.28504616022109985, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.0037, + "step": 29500 + }, + { + "epoch": 1.8061080849501194, + "grad_norm": 0.1696653664112091, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0028, + "step": 29510 + }, + { + "epoch": 1.8067201175102516, + "grad_norm": 0.13287198543548584, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0029, + "step": 29520 + }, + { + "epoch": 1.8073321500703838, + "grad_norm": 0.3142126202583313, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0027, + "step": 29530 + }, + { + "epoch": 1.807944182630516, + "grad_norm": 0.19873814284801483, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0047, + "step": 29540 + }, + { + "epoch": 1.8085562151906482, + "grad_norm": 0.19563670456409454, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0026, + "step": 29550 + }, + { + "epoch": 1.8091682477507804, + "grad_norm": 0.10068873316049576, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0026, + "step": 29560 + }, + { + "epoch": 1.8097802803109126, + "grad_norm": 0.32799556851387024, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0035, + "step": 29570 + }, + { + "epoch": 1.8103923128710449, + "grad_norm": 0.11536014825105667, + "learning_rate": 2.009358476954456e-06, + "loss": 0.0033, + "step": 29580 + }, + { + "epoch": 1.811004345431177, + "grad_norm": 0.2622664272785187, + "learning_rate": 2.008919246387043e-06, + "loss": 0.0036, + "step": 29590 + }, + { + "epoch": 1.8116163779913093, + "grad_norm": 0.14324435591697693, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0022, + "step": 29600 + }, + { + "epoch": 1.8122284105514415, + "grad_norm": 0.1239459365606308, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0025, + "step": 29610 + }, + { + "epoch": 1.8128404431115737, + "grad_norm": 0.17734837532043457, + "learning_rate": 2.007664865889131e-06, + "loss": 0.0038, + "step": 29620 + }, + { + "epoch": 1.8134524756717059, + "grad_norm": 0.09072575718164444, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.004, + "step": 29630 + }, + { + "epoch": 1.814064508231838, + "grad_norm": 0.09809702634811401, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0029, + "step": 29640 + }, + { + "epoch": 1.8146765407919703, + "grad_norm": 0.25450852513313293, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0034, + "step": 29650 + }, + { + "epoch": 1.8152885733521025, + "grad_norm": 0.06607849150896072, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0034, + "step": 29660 + }, + { + "epoch": 1.8159006059122347, + "grad_norm": 0.06259845197200775, + "learning_rate": 2.005785297772354e-06, + "loss": 0.0022, + "step": 29670 + }, + { + "epoch": 1.8165126384723669, + "grad_norm": 0.10518421977758408, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0033, + "step": 29680 + }, + { + "epoch": 1.817124671032499, + "grad_norm": 0.15561337769031525, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0031, + "step": 29690 + }, + { + "epoch": 1.8177367035926313, + "grad_norm": 0.08990275114774704, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.003, + "step": 29700 + }, + { + "epoch": 1.8183487361527635, + "grad_norm": 0.14854039251804352, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0034, + "step": 29710 + }, + { + "epoch": 1.8189607687128957, + "grad_norm": 0.08208440989255905, + "learning_rate": 2.004169609334462e-06, + "loss": 0.0027, + "step": 29720 + }, + { + "epoch": 1.8195728012730277, + "grad_norm": 0.18652454018592834, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0049, + "step": 29730 + }, + { + "epoch": 1.8201848338331599, + "grad_norm": 0.1906939297914505, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0035, + "step": 29740 + }, + { + "epoch": 1.820796866393292, + "grad_norm": 0.4511241614818573, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.004, + "step": 29750 + }, + { + "epoch": 1.8214088989534243, + "grad_norm": 0.12219764292240143, + "learning_rate": 2.003067084067522e-06, + "loss": 0.0027, + "step": 29760 + }, + { + "epoch": 1.8220209315135565, + "grad_norm": 0.17036347091197968, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.004, + "step": 29770 + }, + { + "epoch": 1.8226329640736887, + "grad_norm": 0.12429634481668472, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0038, + "step": 29780 + }, + { + "epoch": 1.823244996633821, + "grad_norm": 0.08393344283103943, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.003, + "step": 29790 + }, + { + "epoch": 1.823857029193953, + "grad_norm": 0.15894703567028046, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.0035, + "step": 29800 + }, + { + "epoch": 1.8244690617540853, + "grad_norm": 0.04971808195114136, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.0034, + "step": 29810 + }, + { + "epoch": 1.8250810943142175, + "grad_norm": 0.0734478309750557, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0025, + "step": 29820 + }, + { + "epoch": 1.8256931268743497, + "grad_norm": 0.10533800721168518, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0048, + "step": 29830 + }, + { + "epoch": 1.826305159434482, + "grad_norm": 0.1547422856092453, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0044, + "step": 29840 + }, + { + "epoch": 1.826917191994614, + "grad_norm": 0.09437263756990433, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0033, + "step": 29850 + }, + { + "epoch": 1.8275292245547463, + "grad_norm": 0.12579235434532166, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0025, + "step": 29860 + }, + { + "epoch": 1.8281412571148785, + "grad_norm": 0.06368619203567505, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0022, + "step": 29870 + }, + { + "epoch": 1.8287532896750107, + "grad_norm": 0.09379997849464417, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0035, + "step": 29880 + }, + { + "epoch": 1.829365322235143, + "grad_norm": 0.11959333717823029, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0023, + "step": 29890 + }, + { + "epoch": 1.8299773547952751, + "grad_norm": 0.14770719408988953, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.0043, + "step": 29900 + }, + { + "epoch": 1.8305893873554073, + "grad_norm": 0.10033386945724487, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0028, + "step": 29910 + }, + { + "epoch": 1.8312014199155395, + "grad_norm": 0.1918601095676422, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0028, + "step": 29920 + }, + { + "epoch": 1.8318134524756717, + "grad_norm": 0.11692646890878677, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0036, + "step": 29930 + }, + { + "epoch": 1.832425485035804, + "grad_norm": 0.07981011271476746, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.0028, + "step": 29940 + }, + { + "epoch": 1.8330375175959361, + "grad_norm": 0.08957688510417938, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0029, + "step": 29950 + }, + { + "epoch": 1.8336495501560683, + "grad_norm": 0.16067251563072205, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0033, + "step": 29960 + }, + { + "epoch": 1.8342615827162005, + "grad_norm": 0.08392494916915894, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0021, + "step": 29970 + }, + { + "epoch": 1.8348736152763327, + "grad_norm": 0.11575599759817123, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0027, + "step": 29980 + }, + { + "epoch": 1.835485647836465, + "grad_norm": 0.13176386058330536, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0029, + "step": 29990 + }, + { + "epoch": 1.8360976803965972, + "grad_norm": 0.11743218451738358, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.003, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.87391671271424e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/training_args.bin b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd9e28a44ae85140e2ef027a82e8be4c39167cc4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5644791eb57bcb4c4808b4c2429b71e4c49eece4fc60f263f4553a3380f230bb +size 6097 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1176494009828ca1a8d623c603070781658572df --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": true, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/generation_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00001-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9129331ebe688349ffe3716540872bf018d97ac3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b408b1954964cad3baaf590b0f8a4577e39f5c3a0cd70d56798923e542df9431 +size 4921072616 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00002-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e7bc8d4157c4995f8c2ef08078ef001e58b1e6be --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889c8a2bc301c00c7a3595263f5be2ecdd423b3a320092d60f49d545bfc96a1c +size 4978830984 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00003-of-00003.safetensors b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3abc785fb50560572293522e2ccb633a3ac3f9b2 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0443466509d753d0638fcb9827ce86fbec9b3c05f32cc9a3db9201092996dd6 +size 4100977896 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model.safetensors.index.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/norm_stats.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..f33de4b80f47e0bac1a414431a8354d8345d60c5 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.65332532291412, + -30.64622355117798, + -14.452480476760865, + -1.8581012797355654, + -2.2742317820549007, + -1.9569469915390014, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 3.0011677881240857, + 22.348905650329584, + 21.68580058555603, + 2.3937565994262693, + 4.117288079452516, + 3.295379007720948, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.570000648498535, + -1.0618462562561035, + 3.623035430908203, + 0.010442602448165417, + 0.7240540385246277, + 0.44398337602615356, + 0.12898989021778107, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.04909086227417, + 17.099597930908203, + 8.363018989562988, + 0.6997263431549072, + 1.1358375549316406, + 0.9687971472740173, + 0.9916459321975708, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.777750787353515, + -21.249025872802733, + -2.4021557040214536, + -4.092200187206268, + -3.2986312219619753, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.645499613952634, + 30.59561934127808, + 14.405443457031247, + 1.8499586300849913, + 2.268683268356323, + 1.963451420021057, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.6817545890808105, + 1.3444018363952637, + -3.5411791801452637, + -0.009792014956474304, + -0.7230188846588135, + -0.44849714636802673, + 0.15749873220920563, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.988739013671875, + 16.884004592895508, + 8.242538452148438, + 0.6991510391235352, + 1.1302146911621094, + 0.9690405130386353, + 0.9875192046165466, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/pi0.yaml b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/special_tokens_map.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer.model b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer_config.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/trainer_state.json b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..27026f8acf754e4f1ea2d74093236b5bc33e5e36 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/trainer_state.json @@ -0,0 +1,21043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8360976803965972, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006120325601321991, + "grad_norm": 2.2432243824005127, + "learning_rate": 1.8e-07, + "loss": 0.1384, + "step": 10 + }, + { + "epoch": 0.0012240651202643981, + "grad_norm": 1.959119439125061, + "learning_rate": 3.8e-07, + "loss": 0.1388, + "step": 20 + }, + { + "epoch": 0.001836097680396597, + "grad_norm": 1.8843899965286255, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1307, + "step": 30 + }, + { + "epoch": 0.0024481302405287963, + "grad_norm": 1.7569042444229126, + "learning_rate": 7.8e-07, + "loss": 0.1238, + "step": 40 + }, + { + "epoch": 0.0030601628006609954, + "grad_norm": 2.6189017295837402, + "learning_rate": 9.800000000000001e-07, + "loss": 0.1275, + "step": 50 + }, + { + "epoch": 0.003672195360793194, + "grad_norm": 1.8418694734573364, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.1032, + "step": 60 + }, + { + "epoch": 0.004284227920925393, + "grad_norm": 1.481676697731018, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0816, + "step": 70 + }, + { + "epoch": 0.004896260481057593, + "grad_norm": 0.9590038061141968, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0732, + "step": 80 + }, + { + "epoch": 0.005508293041189791, + "grad_norm": 1.002897024154663, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0608, + "step": 90 + }, + { + "epoch": 0.006120325601321991, + "grad_norm": 0.9830108284950256, + "learning_rate": 1.98e-06, + "loss": 0.042, + "step": 100 + }, + { + "epoch": 0.006732358161454189, + "grad_norm": 0.858244001865387, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0314, + "step": 110 + }, + { + "epoch": 0.007344390721586388, + "grad_norm": 0.5761063694953918, + "learning_rate": 2.38e-06, + "loss": 0.029, + "step": 120 + }, + { + "epoch": 0.007956423281718587, + "grad_norm": 0.5434514284133911, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0227, + "step": 130 + }, + { + "epoch": 0.008568455841850786, + "grad_norm": 0.6488766670227051, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0202, + "step": 140 + }, + { + "epoch": 0.009180488401982986, + "grad_norm": 0.36763015389442444, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0157, + "step": 150 + }, + { + "epoch": 0.009792520962115185, + "grad_norm": 0.49271446466445923, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.010404553522247383, + "grad_norm": 0.23608209192752838, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0122, + "step": 170 + }, + { + "epoch": 0.011016586082379582, + "grad_norm": 0.47871828079223633, + "learning_rate": 3.58e-06, + "loss": 0.0131, + "step": 180 + }, + { + "epoch": 0.011628618642511782, + "grad_norm": 0.6862446069717407, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0131, + "step": 190 + }, + { + "epoch": 0.012240651202643981, + "grad_norm": 0.7964349389076233, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 0.01285268376277618, + "grad_norm": 0.5564846396446228, + "learning_rate": 4.18e-06, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 0.013464716322908379, + "grad_norm": 0.2810452878475189, + "learning_rate": 4.38e-06, + "loss": 0.0128, + "step": 220 + }, + { + "epoch": 0.014076748883040578, + "grad_norm": 0.4474979341030121, + "learning_rate": 4.58e-06, + "loss": 0.0188, + "step": 230 + }, + { + "epoch": 0.014688781443172776, + "grad_norm": 0.47965875267982483, + "learning_rate": 4.78e-06, + "loss": 0.0141, + "step": 240 + }, + { + "epoch": 0.015300814003304975, + "grad_norm": 0.3410812020301819, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0085, + "step": 250 + }, + { + "epoch": 0.015912846563437173, + "grad_norm": 0.39907002449035645, + "learning_rate": 5.18e-06, + "loss": 0.0106, + "step": 260 + }, + { + "epoch": 0.016524879123569373, + "grad_norm": 0.28909367322921753, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0103, + "step": 270 + }, + { + "epoch": 0.017136911683701572, + "grad_norm": 0.31524109840393066, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0101, + "step": 280 + }, + { + "epoch": 0.017748944243833772, + "grad_norm": 0.29430100321769714, + "learning_rate": 5.78e-06, + "loss": 0.0109, + "step": 290 + }, + { + "epoch": 0.01836097680396597, + "grad_norm": 0.2709169387817383, + "learning_rate": 5.98e-06, + "loss": 0.0102, + "step": 300 + }, + { + "epoch": 0.01897300936409817, + "grad_norm": 0.33067119121551514, + "learning_rate": 6.18e-06, + "loss": 0.0095, + "step": 310 + }, + { + "epoch": 0.01958504192423037, + "grad_norm": 0.28110620379447937, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0102, + "step": 320 + }, + { + "epoch": 0.02019707448436257, + "grad_norm": 0.27736902236938477, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0088, + "step": 330 + }, + { + "epoch": 0.020809107044494766, + "grad_norm": 0.3238557279109955, + "learning_rate": 6.780000000000001e-06, + "loss": 0.01, + "step": 340 + }, + { + "epoch": 0.021421139604626965, + "grad_norm": 0.30263441801071167, + "learning_rate": 6.98e-06, + "loss": 0.0095, + "step": 350 + }, + { + "epoch": 0.022033172164759165, + "grad_norm": 0.2618265450000763, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0096, + "step": 360 + }, + { + "epoch": 0.022645204724891364, + "grad_norm": 0.272565633058548, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0093, + "step": 370 + }, + { + "epoch": 0.023257237285023564, + "grad_norm": 0.44272440671920776, + "learning_rate": 7.58e-06, + "loss": 0.0087, + "step": 380 + }, + { + "epoch": 0.023869269845155763, + "grad_norm": 0.27631404995918274, + "learning_rate": 7.78e-06, + "loss": 0.0093, + "step": 390 + }, + { + "epoch": 0.024481302405287963, + "grad_norm": 0.4108494520187378, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0093, + "step": 400 + }, + { + "epoch": 0.02509333496542016, + "grad_norm": 0.43498387932777405, + "learning_rate": 8.18e-06, + "loss": 0.0098, + "step": 410 + }, + { + "epoch": 0.02570536752555236, + "grad_norm": 0.3419845700263977, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0091, + "step": 420 + }, + { + "epoch": 0.026317400085684558, + "grad_norm": 0.5677013993263245, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0104, + "step": 430 + }, + { + "epoch": 0.026929432645816757, + "grad_norm": 0.24424298107624054, + "learning_rate": 8.78e-06, + "loss": 0.0089, + "step": 440 + }, + { + "epoch": 0.027541465205948957, + "grad_norm": 0.267781138420105, + "learning_rate": 8.98e-06, + "loss": 0.0107, + "step": 450 + }, + { + "epoch": 0.028153497766081156, + "grad_norm": 0.38459253311157227, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0081, + "step": 460 + }, + { + "epoch": 0.028765530326213356, + "grad_norm": 0.2647954523563385, + "learning_rate": 9.38e-06, + "loss": 0.0082, + "step": 470 + }, + { + "epoch": 0.029377562886345552, + "grad_norm": 0.44312018156051636, + "learning_rate": 9.58e-06, + "loss": 0.0102, + "step": 480 + }, + { + "epoch": 0.02998959544647775, + "grad_norm": 0.2309781014919281, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0118, + "step": 490 + }, + { + "epoch": 0.03060162800660995, + "grad_norm": 0.41755014657974243, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0094, + "step": 500 + }, + { + "epoch": 0.03121366056674215, + "grad_norm": 0.38537120819091797, + "learning_rate": 1.018e-05, + "loss": 0.011, + "step": 510 + }, + { + "epoch": 0.031825693126874346, + "grad_norm": 0.49801477789878845, + "learning_rate": 1.038e-05, + "loss": 0.0093, + "step": 520 + }, + { + "epoch": 0.03243772568700655, + "grad_norm": 0.3854966163635254, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0116, + "step": 530 + }, + { + "epoch": 0.033049758247138745, + "grad_norm": 0.3163810968399048, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.008, + "step": 540 + }, + { + "epoch": 0.03366179080727095, + "grad_norm": 0.33000636100769043, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0093, + "step": 550 + }, + { + "epoch": 0.034273823367403145, + "grad_norm": 0.3350297808647156, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0083, + "step": 560 + }, + { + "epoch": 0.03488585592753535, + "grad_norm": 0.18780949711799622, + "learning_rate": 1.138e-05, + "loss": 0.0097, + "step": 570 + }, + { + "epoch": 0.035497888487667544, + "grad_norm": 0.20399607717990875, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0092, + "step": 580 + }, + { + "epoch": 0.03610992104779974, + "grad_norm": 0.15931005775928497, + "learning_rate": 1.178e-05, + "loss": 0.0076, + "step": 590 + }, + { + "epoch": 0.03672195360793194, + "grad_norm": 0.20751547813415527, + "learning_rate": 1.198e-05, + "loss": 0.0079, + "step": 600 + }, + { + "epoch": 0.03733398616806414, + "grad_norm": 0.39666953682899475, + "learning_rate": 1.218e-05, + "loss": 0.0072, + "step": 610 + }, + { + "epoch": 0.03794601872819634, + "grad_norm": 0.385407030582428, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0089, + "step": 620 + }, + { + "epoch": 0.03855805128832854, + "grad_norm": 0.5228332877159119, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0092, + "step": 630 + }, + { + "epoch": 0.03917008384846074, + "grad_norm": 0.29315415024757385, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0098, + "step": 640 + }, + { + "epoch": 0.03978211640859294, + "grad_norm": 0.4300646483898163, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0079, + "step": 650 + }, + { + "epoch": 0.04039414896872514, + "grad_norm": 0.38021156191825867, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0103, + "step": 660 + }, + { + "epoch": 0.041006181528857336, + "grad_norm": 0.43489688634872437, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0105, + "step": 670 + }, + { + "epoch": 0.04161821408898953, + "grad_norm": 0.48019328713417053, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0124, + "step": 680 + }, + { + "epoch": 0.042230246649121735, + "grad_norm": 0.28486984968185425, + "learning_rate": 1.378e-05, + "loss": 0.0122, + "step": 690 + }, + { + "epoch": 0.04284227920925393, + "grad_norm": 0.35172080993652344, + "learning_rate": 1.398e-05, + "loss": 0.0093, + "step": 700 + }, + { + "epoch": 0.043454311769386134, + "grad_norm": 0.32531124353408813, + "learning_rate": 1.418e-05, + "loss": 0.0116, + "step": 710 + }, + { + "epoch": 0.04406634432951833, + "grad_norm": 0.388637512922287, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0077, + "step": 720 + }, + { + "epoch": 0.04467837688965053, + "grad_norm": 0.3816429078578949, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0123, + "step": 730 + }, + { + "epoch": 0.04529040944978273, + "grad_norm": 0.22786036133766174, + "learning_rate": 1.478e-05, + "loss": 0.0089, + "step": 740 + }, + { + "epoch": 0.045902442009914925, + "grad_norm": 0.2965328097343445, + "learning_rate": 1.498e-05, + "loss": 0.011, + "step": 750 + }, + { + "epoch": 0.04651447457004713, + "grad_norm": 0.3568362593650818, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0101, + "step": 760 + }, + { + "epoch": 0.047126507130179324, + "grad_norm": 0.2972166836261749, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0093, + "step": 770 + }, + { + "epoch": 0.04773853969031153, + "grad_norm": 0.4221388101577759, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.012, + "step": 780 + }, + { + "epoch": 0.04835057225044372, + "grad_norm": 0.37255391478538513, + "learning_rate": 1.578e-05, + "loss": 0.0085, + "step": 790 + }, + { + "epoch": 0.048962604810575926, + "grad_norm": 0.36007094383239746, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.008, + "step": 800 + }, + { + "epoch": 0.04957463737070812, + "grad_norm": 0.40588808059692383, + "learning_rate": 1.618e-05, + "loss": 0.0081, + "step": 810 + }, + { + "epoch": 0.05018666993084032, + "grad_norm": 0.46563687920570374, + "learning_rate": 1.638e-05, + "loss": 0.0076, + "step": 820 + }, + { + "epoch": 0.05079870249097252, + "grad_norm": 0.3161381483078003, + "learning_rate": 1.658e-05, + "loss": 0.0129, + "step": 830 + }, + { + "epoch": 0.05141073505110472, + "grad_norm": 0.3800298869609833, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0146, + "step": 840 + }, + { + "epoch": 0.05202276761123692, + "grad_norm": 0.36572107672691345, + "learning_rate": 1.698e-05, + "loss": 0.0148, + "step": 850 + }, + { + "epoch": 0.052634800171369116, + "grad_norm": 0.4084141254425049, + "learning_rate": 1.718e-05, + "loss": 0.0085, + "step": 860 + }, + { + "epoch": 0.05324683273150132, + "grad_norm": 0.2906867265701294, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0116, + "step": 870 + }, + { + "epoch": 0.053858865291633515, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.758e-05, + "loss": 0.0076, + "step": 880 + }, + { + "epoch": 0.05447089785176571, + "grad_norm": 0.5292996764183044, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0094, + "step": 890 + }, + { + "epoch": 0.055082930411897914, + "grad_norm": 0.23192685842514038, + "learning_rate": 1.798e-05, + "loss": 0.0116, + "step": 900 + }, + { + "epoch": 0.05569496297203011, + "grad_norm": 0.41050270199775696, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0099, + "step": 910 + }, + { + "epoch": 0.05630699553216231, + "grad_norm": 0.3336002230644226, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.05691902809229451, + "grad_norm": 0.46233776211738586, + "learning_rate": 1.858e-05, + "loss": 0.0104, + "step": 930 + }, + { + "epoch": 0.05753106065242671, + "grad_norm": 0.36776405572891235, + "learning_rate": 1.878e-05, + "loss": 0.0115, + "step": 940 + }, + { + "epoch": 0.05814309321255891, + "grad_norm": 0.47848618030548096, + "learning_rate": 1.898e-05, + "loss": 0.0108, + "step": 950 + }, + { + "epoch": 0.058755125772691104, + "grad_norm": 0.35507604479789734, + "learning_rate": 1.918e-05, + "loss": 0.0095, + "step": 960 + }, + { + "epoch": 0.05936715833282331, + "grad_norm": 0.4613397717475891, + "learning_rate": 1.938e-05, + "loss": 0.0119, + "step": 970 + }, + { + "epoch": 0.0599791908929555, + "grad_norm": 0.34492260217666626, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0109, + "step": 980 + }, + { + "epoch": 0.060591223453087706, + "grad_norm": 0.34624582529067993, + "learning_rate": 1.978e-05, + "loss": 0.0099, + "step": 990 + }, + { + "epoch": 0.0612032560132199, + "grad_norm": 0.9161475896835327, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0109, + "step": 1000 + }, + { + "epoch": 0.061815288573352105, + "grad_norm": 0.367807537317276, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0087, + "step": 1010 + }, + { + "epoch": 0.0624273211334843, + "grad_norm": 0.4043216407299042, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.0084, + "step": 1020 + }, + { + "epoch": 0.0630393536936165, + "grad_norm": 0.315305233001709, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0075, + "step": 1030 + }, + { + "epoch": 0.06365138625374869, + "grad_norm": 0.49702969193458557, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0103, + "step": 1040 + }, + { + "epoch": 0.0642634188138809, + "grad_norm": 0.46286216378211975, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0116, + "step": 1050 + }, + { + "epoch": 0.0648754513740131, + "grad_norm": 0.332142174243927, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0103, + "step": 1060 + }, + { + "epoch": 0.0654874839341453, + "grad_norm": 0.6118510961532593, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0118, + "step": 1070 + }, + { + "epoch": 0.06609951649427749, + "grad_norm": 0.49074795842170715, + "learning_rate": 1.999967041472886e-05, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.0667115490544097, + "grad_norm": 0.42575374245643616, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0125, + "step": 1090 + }, + { + "epoch": 0.0673235816145419, + "grad_norm": 0.3223794996738434, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0091, + "step": 1100 + }, + { + "epoch": 0.06793561417467409, + "grad_norm": 0.4952760636806488, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.011, + "step": 1110 + }, + { + "epoch": 0.06854764673480629, + "grad_norm": 0.36144813895225525, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0096, + "step": 1120 + }, + { + "epoch": 0.06915967929493849, + "grad_norm": 0.31190025806427, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0115, + "step": 1130 + }, + { + "epoch": 0.0697717118550707, + "grad_norm": 0.7014928460121155, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.07038374441520288, + "grad_norm": 0.4382205605506897, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0111, + "step": 1150 + }, + { + "epoch": 0.07099577697533509, + "grad_norm": 0.3750714659690857, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0116, + "step": 1160 + }, + { + "epoch": 0.07160780953546729, + "grad_norm": 0.4174371361732483, + "learning_rate": 1.999849173538598e-05, + "loss": 0.009, + "step": 1170 + }, + { + "epoch": 0.07221984209559948, + "grad_norm": 0.44394591450691223, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.07283187465573168, + "grad_norm": 0.43412888050079346, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0098, + "step": 1190 + }, + { + "epoch": 0.07344390721586389, + "grad_norm": 0.6421196460723877, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.01, + "step": 1200 + }, + { + "epoch": 0.07405593977599609, + "grad_norm": 0.6313903331756592, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0137, + "step": 1210 + }, + { + "epoch": 0.07466797233612828, + "grad_norm": 0.49340254068374634, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0104, + "step": 1220 + }, + { + "epoch": 0.07528000489626048, + "grad_norm": 0.40420663356781006, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0126, + "step": 1230 + }, + { + "epoch": 0.07589203745639268, + "grad_norm": 0.3955318033695221, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.013, + "step": 1240 + }, + { + "epoch": 0.07650407001652489, + "grad_norm": 0.4967520236968994, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0098, + "step": 1250 + }, + { + "epoch": 0.07711610257665708, + "grad_norm": 0.3380029499530792, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0084, + "step": 1260 + }, + { + "epoch": 0.07772813513678928, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.009, + "step": 1270 + }, + { + "epoch": 0.07834016769692148, + "grad_norm": 0.4533286392688751, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0093, + "step": 1280 + }, + { + "epoch": 0.07895220025705367, + "grad_norm": 0.39559242129325867, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0106, + "step": 1290 + }, + { + "epoch": 0.07956423281718587, + "grad_norm": 0.23190362751483917, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.01, + "step": 1300 + }, + { + "epoch": 0.08017626537731808, + "grad_norm": 0.4732286334037781, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.08078829793745028, + "grad_norm": 0.3010174036026001, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.08140033049758247, + "grad_norm": 0.3989834189414978, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0097, + "step": 1330 + }, + { + "epoch": 0.08201236305771467, + "grad_norm": 0.4597114622592926, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.01, + "step": 1340 + }, + { + "epoch": 0.08262439561784687, + "grad_norm": 0.426826536655426, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.011, + "step": 1350 + }, + { + "epoch": 0.08323642817797906, + "grad_norm": 0.4876341223716736, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0105, + "step": 1360 + }, + { + "epoch": 0.08384846073811127, + "grad_norm": 0.5444457530975342, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0099, + "step": 1370 + }, + { + "epoch": 0.08446049329824347, + "grad_norm": 0.5096126794815063, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.007, + "step": 1380 + }, + { + "epoch": 0.08507252585837567, + "grad_norm": 0.43828368186950684, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.08568455841850786, + "grad_norm": 0.40163955092430115, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0103, + "step": 1400 + }, + { + "epoch": 0.08629659097864006, + "grad_norm": 0.3110432028770447, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0115, + "step": 1410 + }, + { + "epoch": 0.08690862353877227, + "grad_norm": 0.8393893241882324, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.012, + "step": 1420 + }, + { + "epoch": 0.08752065609890446, + "grad_norm": 0.2751714289188385, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0093, + "step": 1430 + }, + { + "epoch": 0.08813268865903666, + "grad_norm": 0.36969971656799316, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0112, + "step": 1440 + }, + { + "epoch": 0.08874472121916886, + "grad_norm": 0.3721938729286194, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08935675377930107, + "grad_norm": 0.26564934849739075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0107, + "step": 1460 + }, + { + "epoch": 0.08996878633943325, + "grad_norm": 0.36552169919013977, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0123, + "step": 1470 + }, + { + "epoch": 0.09058081889956546, + "grad_norm": 0.23664990067481995, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0074, + "step": 1480 + }, + { + "epoch": 0.09119285145969766, + "grad_norm": 0.49903133511543274, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.09180488401982985, + "grad_norm": 0.43505051732063293, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 0.09241691657996205, + "grad_norm": 0.20318932831287384, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0088, + "step": 1510 + }, + { + "epoch": 0.09302894914009426, + "grad_norm": 0.3289708197116852, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.008, + "step": 1520 + }, + { + "epoch": 0.09364098170022646, + "grad_norm": 0.3920934200286865, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0145, + "step": 1530 + }, + { + "epoch": 0.09425301426035865, + "grad_norm": 0.40396374464035034, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.09486504682049085, + "grad_norm": 0.4044182300567627, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.012, + "step": 1550 + }, + { + "epoch": 0.09547707938062305, + "grad_norm": 0.2318611741065979, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0115, + "step": 1560 + }, + { + "epoch": 0.09608911194075524, + "grad_norm": 0.3905714750289917, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.008, + "step": 1570 + }, + { + "epoch": 0.09670114450088745, + "grad_norm": 0.2516922652721405, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0084, + "step": 1580 + }, + { + "epoch": 0.09731317706101965, + "grad_norm": 0.338455468416214, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0122, + "step": 1590 + }, + { + "epoch": 0.09792520962115185, + "grad_norm": 0.31875041127204895, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0081, + "step": 1600 + }, + { + "epoch": 0.09853724218128404, + "grad_norm": 0.2996121644973755, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0068, + "step": 1610 + }, + { + "epoch": 0.09914927474141624, + "grad_norm": 0.4381162226200104, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0103, + "step": 1620 + }, + { + "epoch": 0.09976130730154845, + "grad_norm": 0.5531038045883179, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0168, + "step": 1630 + }, + { + "epoch": 0.10037333986168064, + "grad_norm": 1.1283385753631592, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0119, + "step": 1640 + }, + { + "epoch": 0.10098537242181284, + "grad_norm": 0.38017332553863525, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0096, + "step": 1650 + }, + { + "epoch": 0.10159740498194504, + "grad_norm": 0.4669477045536041, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0111, + "step": 1660 + }, + { + "epoch": 0.10220943754207724, + "grad_norm": 0.3903254270553589, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0098, + "step": 1670 + }, + { + "epoch": 0.10282147010220943, + "grad_norm": 0.49671587347984314, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0083, + "step": 1680 + }, + { + "epoch": 0.10343350266234164, + "grad_norm": 0.36555853486061096, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0088, + "step": 1690 + }, + { + "epoch": 0.10404553522247384, + "grad_norm": 0.21804726123809814, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0086, + "step": 1700 + }, + { + "epoch": 0.10465756778260603, + "grad_norm": 0.6744784116744995, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0109, + "step": 1710 + }, + { + "epoch": 0.10526960034273823, + "grad_norm": 0.34379470348358154, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0096, + "step": 1720 + }, + { + "epoch": 0.10588163290287043, + "grad_norm": 0.27760598063468933, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 0.10649366546300264, + "grad_norm": 0.36294442415237427, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10710569802313483, + "grad_norm": 0.42200908064842224, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.011, + "step": 1750 + }, + { + "epoch": 0.10771773058326703, + "grad_norm": 0.47863906621932983, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0107, + "step": 1760 + }, + { + "epoch": 0.10832976314339923, + "grad_norm": 0.32717248797416687, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0112, + "step": 1770 + }, + { + "epoch": 0.10894179570353142, + "grad_norm": 0.4255545735359192, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0106, + "step": 1780 + }, + { + "epoch": 0.10955382826366362, + "grad_norm": 0.5034983158111572, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0098, + "step": 1790 + }, + { + "epoch": 0.11016586082379583, + "grad_norm": 0.37071412801742554, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0099, + "step": 1800 + }, + { + "epoch": 0.11077789338392803, + "grad_norm": 0.23624737560749054, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0069, + "step": 1810 + }, + { + "epoch": 0.11138992594406022, + "grad_norm": 0.5815485715866089, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0095, + "step": 1820 + }, + { + "epoch": 0.11200195850419242, + "grad_norm": 1.1828722953796387, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0104, + "step": 1830 + }, + { + "epoch": 0.11261399106432463, + "grad_norm": 0.38099589943885803, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0106, + "step": 1840 + }, + { + "epoch": 0.11322602362445681, + "grad_norm": 0.38476184010505676, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0112, + "step": 1850 + }, + { + "epoch": 0.11383805618458902, + "grad_norm": 0.48982104659080505, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0125, + "step": 1860 + }, + { + "epoch": 0.11445008874472122, + "grad_norm": 0.4165821671485901, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0099, + "step": 1870 + }, + { + "epoch": 0.11506212130485342, + "grad_norm": 0.3412662446498871, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0061, + "step": 1880 + }, + { + "epoch": 0.11567415386498561, + "grad_norm": 0.46617937088012695, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0129, + "step": 1890 + }, + { + "epoch": 0.11628618642511782, + "grad_norm": 0.2705824077129364, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0082, + "step": 1900 + }, + { + "epoch": 0.11689821898525002, + "grad_norm": 0.3567829430103302, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0125, + "step": 1910 + }, + { + "epoch": 0.11751025154538221, + "grad_norm": 0.4438138008117676, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0129, + "step": 1920 + }, + { + "epoch": 0.11812228410551441, + "grad_norm": 0.356703519821167, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0083, + "step": 1930 + }, + { + "epoch": 0.11873431666564661, + "grad_norm": 0.6039804220199585, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0086, + "step": 1940 + }, + { + "epoch": 0.11934634922577882, + "grad_norm": 0.4572801887989044, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0067, + "step": 1950 + }, + { + "epoch": 0.119958381785911, + "grad_norm": 0.5063445568084717, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0091, + "step": 1960 + }, + { + "epoch": 0.12057041434604321, + "grad_norm": 0.3467857837677002, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.008, + "step": 1970 + }, + { + "epoch": 0.12118244690617541, + "grad_norm": 0.4875742197036743, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0102, + "step": 1980 + }, + { + "epoch": 0.1217944794663076, + "grad_norm": 0.3209119141101837, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0081, + "step": 1990 + }, + { + "epoch": 0.1224065120264398, + "grad_norm": 0.4731980860233307, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0118, + "step": 2000 + }, + { + "epoch": 0.123018544586572, + "grad_norm": 0.5742963552474976, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0125, + "step": 2010 + }, + { + "epoch": 0.12363057714670421, + "grad_norm": 0.41357406973838806, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.1242426097068364, + "grad_norm": 0.6277521252632141, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0096, + "step": 2030 + }, + { + "epoch": 0.1248546422669686, + "grad_norm": 0.41252902150154114, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0108, + "step": 2040 + }, + { + "epoch": 0.1254666748271008, + "grad_norm": 0.782122790813446, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0134, + "step": 2050 + }, + { + "epoch": 0.126078707387233, + "grad_norm": 0.45011264085769653, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0102, + "step": 2060 + }, + { + "epoch": 0.1266907399473652, + "grad_norm": 0.2724951207637787, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0088, + "step": 2070 + }, + { + "epoch": 0.12730277250749739, + "grad_norm": 0.2351481169462204, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.009, + "step": 2080 + }, + { + "epoch": 0.1279148050676296, + "grad_norm": 0.34568479657173157, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0092, + "step": 2090 + }, + { + "epoch": 0.1285268376277618, + "grad_norm": 0.44493499398231506, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0087, + "step": 2100 + }, + { + "epoch": 0.129138870187894, + "grad_norm": 0.3011283874511719, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0108, + "step": 2110 + }, + { + "epoch": 0.1297509027480262, + "grad_norm": 0.4170232117176056, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0087, + "step": 2120 + }, + { + "epoch": 0.1303629353081584, + "grad_norm": 0.2696056365966797, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0093, + "step": 2130 + }, + { + "epoch": 0.1309749678682906, + "grad_norm": 0.4092336893081665, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0083, + "step": 2140 + }, + { + "epoch": 0.13158700042842278, + "grad_norm": 0.36637401580810547, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.01, + "step": 2150 + }, + { + "epoch": 0.13219903298855498, + "grad_norm": 0.28675684332847595, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0079, + "step": 2160 + }, + { + "epoch": 0.13281106554868718, + "grad_norm": 0.27699902653694153, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0071, + "step": 2170 + }, + { + "epoch": 0.1334230981088194, + "grad_norm": 0.3832298517227173, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0104, + "step": 2180 + }, + { + "epoch": 0.1340351306689516, + "grad_norm": 0.3590598702430725, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 0.1346471632290838, + "grad_norm": 0.21830014884471893, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0093, + "step": 2200 + }, + { + "epoch": 0.135259195789216, + "grad_norm": 0.342492938041687, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0109, + "step": 2210 + }, + { + "epoch": 0.13587122834934817, + "grad_norm": 0.6337023973464966, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0082, + "step": 2220 + }, + { + "epoch": 0.13648326090948038, + "grad_norm": 0.41742798686027527, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13709529346961258, + "grad_norm": 0.3180190324783325, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 0.13770732602974478, + "grad_norm": 0.36720144748687744, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0114, + "step": 2250 + }, + { + "epoch": 0.13831935858987698, + "grad_norm": 0.29457366466522217, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0077, + "step": 2260 + }, + { + "epoch": 0.1389313911500092, + "grad_norm": 0.24702222645282745, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0074, + "step": 2270 + }, + { + "epoch": 0.1395434237101414, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0079, + "step": 2280 + }, + { + "epoch": 0.14015545627027357, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0127, + "step": 2290 + }, + { + "epoch": 0.14076748883040577, + "grad_norm": 0.44338247179985046, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0077, + "step": 2300 + }, + { + "epoch": 0.14137952139053797, + "grad_norm": 0.31765618920326233, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0091, + "step": 2310 + }, + { + "epoch": 0.14199155395067017, + "grad_norm": 0.322534441947937, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0068, + "step": 2320 + }, + { + "epoch": 0.14260358651080238, + "grad_norm": 0.23571068048477173, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0094, + "step": 2330 + }, + { + "epoch": 0.14321561907093458, + "grad_norm": 0.26818808913230896, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0092, + "step": 2340 + }, + { + "epoch": 0.14382765163106678, + "grad_norm": 0.31886982917785645, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0084, + "step": 2350 + }, + { + "epoch": 0.14443968419119896, + "grad_norm": 0.5176070928573608, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0104, + "step": 2360 + }, + { + "epoch": 0.14505171675133116, + "grad_norm": 0.4322161078453064, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0081, + "step": 2370 + }, + { + "epoch": 0.14566374931146336, + "grad_norm": 0.4076510965824127, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0102, + "step": 2380 + }, + { + "epoch": 0.14627578187159557, + "grad_norm": 0.3808838725090027, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0096, + "step": 2390 + }, + { + "epoch": 0.14688781443172777, + "grad_norm": 0.5045232176780701, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0109, + "step": 2400 + }, + { + "epoch": 0.14749984699185997, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0077, + "step": 2410 + }, + { + "epoch": 0.14811187955199218, + "grad_norm": 0.28561875224113464, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0141, + "step": 2420 + }, + { + "epoch": 0.14872391211212435, + "grad_norm": 0.414410799741745, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0094, + "step": 2430 + }, + { + "epoch": 0.14933594467225655, + "grad_norm": 0.4587285816669464, + "learning_rate": 1.989086647373215e-05, + "loss": 0.009, + "step": 2440 + }, + { + "epoch": 0.14994797723238876, + "grad_norm": 0.7567377686500549, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0084, + "step": 2450 + }, + { + "epoch": 0.15056000979252096, + "grad_norm": 0.4980221390724182, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0087, + "step": 2460 + }, + { + "epoch": 0.15117204235265316, + "grad_norm": 0.41810303926467896, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0082, + "step": 2470 + }, + { + "epoch": 0.15178407491278537, + "grad_norm": 0.4193445146083832, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0131, + "step": 2480 + }, + { + "epoch": 0.15239610747291757, + "grad_norm": 0.2561246156692505, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0074, + "step": 2490 + }, + { + "epoch": 0.15300814003304977, + "grad_norm": 0.22316500544548035, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0069, + "step": 2500 + }, + { + "epoch": 0.15362017259318195, + "grad_norm": 0.31504112482070923, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0097, + "step": 2510 + }, + { + "epoch": 0.15423220515331415, + "grad_norm": 0.2944568991661072, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0101, + "step": 2520 + }, + { + "epoch": 0.15484423771344635, + "grad_norm": 0.2744649052619934, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0074, + "step": 2530 + }, + { + "epoch": 0.15545627027357856, + "grad_norm": 0.2717166841030121, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.007, + "step": 2540 + }, + { + "epoch": 0.15606830283371076, + "grad_norm": 0.32652929425239563, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0097, + "step": 2550 + }, + { + "epoch": 0.15668033539384296, + "grad_norm": 0.3169964849948883, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0089, + "step": 2560 + }, + { + "epoch": 0.15729236795397517, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0083, + "step": 2570 + }, + { + "epoch": 0.15790440051410734, + "grad_norm": 0.3869011700153351, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0088, + "step": 2580 + }, + { + "epoch": 0.15851643307423954, + "grad_norm": 0.2944110333919525, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0085, + "step": 2590 + }, + { + "epoch": 0.15912846563437175, + "grad_norm": 0.27993839979171753, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0097, + "step": 2600 + }, + { + "epoch": 0.15974049819450395, + "grad_norm": 0.42018845677375793, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0104, + "step": 2610 + }, + { + "epoch": 0.16035253075463615, + "grad_norm": 0.45006832480430603, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0072, + "step": 2620 + }, + { + "epoch": 0.16096456331476836, + "grad_norm": 0.275564581155777, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0072, + "step": 2630 + }, + { + "epoch": 0.16157659587490056, + "grad_norm": 0.503052294254303, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0091, + "step": 2640 + }, + { + "epoch": 0.16218862843503273, + "grad_norm": 0.33740976452827454, + "learning_rate": 1.985678043265668e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.16280066099516494, + "grad_norm": 0.5379078984260559, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0097, + "step": 2660 + }, + { + "epoch": 0.16341269355529714, + "grad_norm": 0.3605813980102539, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0109, + "step": 2670 + }, + { + "epoch": 0.16402472611542934, + "grad_norm": 0.49490585923194885, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.013, + "step": 2680 + }, + { + "epoch": 0.16463675867556155, + "grad_norm": 0.29894375801086426, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0089, + "step": 2690 + }, + { + "epoch": 0.16524879123569375, + "grad_norm": 0.395270437002182, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 0.16586082379582595, + "grad_norm": 0.25507843494415283, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0078, + "step": 2710 + }, + { + "epoch": 0.16647285635595813, + "grad_norm": 0.3304852843284607, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0178, + "step": 2720 + }, + { + "epoch": 0.16708488891609033, + "grad_norm": 0.4356633126735687, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0082, + "step": 2730 + }, + { + "epoch": 0.16769692147622253, + "grad_norm": 0.4104527533054352, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 0.16830895403635474, + "grad_norm": 0.25723493099212646, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0097, + "step": 2750 + }, + { + "epoch": 0.16892098659648694, + "grad_norm": 0.3280608057975769, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0079, + "step": 2760 + }, + { + "epoch": 0.16953301915661914, + "grad_norm": 0.4641128480434418, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0081, + "step": 2770 + }, + { + "epoch": 0.17014505171675134, + "grad_norm": 0.2704941928386688, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0112, + "step": 2780 + }, + { + "epoch": 0.17075708427688352, + "grad_norm": 0.42343780398368835, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0084, + "step": 2790 + }, + { + "epoch": 0.17136911683701572, + "grad_norm": 0.2606532573699951, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0085, + "step": 2800 + }, + { + "epoch": 0.17198114939714793, + "grad_norm": 0.39099374413490295, + "learning_rate": 1.982773261916081e-05, + "loss": 0.014, + "step": 2810 + }, + { + "epoch": 0.17259318195728013, + "grad_norm": 0.32653889060020447, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0095, + "step": 2820 + }, + { + "epoch": 0.17320521451741233, + "grad_norm": 0.34765321016311646, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17381724707754453, + "grad_norm": 0.2844177186489105, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.011, + "step": 2840 + }, + { + "epoch": 0.17442927963767674, + "grad_norm": 0.5079899430274963, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0114, + "step": 2850 + }, + { + "epoch": 0.1750413121978089, + "grad_norm": 0.4043678045272827, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0073, + "step": 2860 + }, + { + "epoch": 0.17565334475794112, + "grad_norm": 0.3833003640174866, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0076, + "step": 2870 + }, + { + "epoch": 0.17626537731807332, + "grad_norm": 0.2826341986656189, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0072, + "step": 2880 + }, + { + "epoch": 0.17687740987820552, + "grad_norm": 0.6043460965156555, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0084, + "step": 2890 + }, + { + "epoch": 0.17748944243833772, + "grad_norm": 0.3238481879234314, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0149, + "step": 2900 + }, + { + "epoch": 0.17810147499846993, + "grad_norm": 0.45817995071411133, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0103, + "step": 2910 + }, + { + "epoch": 0.17871350755860213, + "grad_norm": 0.21048744022846222, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0094, + "step": 2920 + }, + { + "epoch": 0.1793255401187343, + "grad_norm": 0.3401891887187958, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0058, + "step": 2930 + }, + { + "epoch": 0.1799375726788665, + "grad_norm": 0.3655509948730469, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0071, + "step": 2940 + }, + { + "epoch": 0.1805496052389987, + "grad_norm": 0.47406241297721863, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.18116163779913091, + "grad_norm": 0.3278841972351074, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0121, + "step": 2960 + }, + { + "epoch": 0.18177367035926312, + "grad_norm": 0.271436482667923, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.008, + "step": 2970 + }, + { + "epoch": 0.18238570291939532, + "grad_norm": 0.41475561261177063, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.008, + "step": 2980 + }, + { + "epoch": 0.18299773547952752, + "grad_norm": 0.5389090776443481, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0091, + "step": 2990 + }, + { + "epoch": 0.1836097680396597, + "grad_norm": 0.3958609700202942, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0106, + "step": 3000 + }, + { + "epoch": 0.1842218005997919, + "grad_norm": 0.3456019461154938, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0084, + "step": 3010 + }, + { + "epoch": 0.1848338331599241, + "grad_norm": 0.2959386706352234, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0071, + "step": 3020 + }, + { + "epoch": 0.1854458657200563, + "grad_norm": 0.2617223858833313, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0082, + "step": 3030 + }, + { + "epoch": 0.1860578982801885, + "grad_norm": 0.45173966884613037, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0105, + "step": 3040 + }, + { + "epoch": 0.1866699308403207, + "grad_norm": 0.4127421975135803, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.008, + "step": 3050 + }, + { + "epoch": 0.18728196340045292, + "grad_norm": 0.3142230808734894, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0085, + "step": 3060 + }, + { + "epoch": 0.1878939959605851, + "grad_norm": 0.49720287322998047, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0089, + "step": 3070 + }, + { + "epoch": 0.1885060285207173, + "grad_norm": 0.6417365074157715, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0088, + "step": 3080 + }, + { + "epoch": 0.1891180610808495, + "grad_norm": 0.44801583886146545, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0098, + "step": 3090 + }, + { + "epoch": 0.1897300936409817, + "grad_norm": 0.3606127202510834, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0087, + "step": 3100 + }, + { + "epoch": 0.1903421262011139, + "grad_norm": 0.268971711397171, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0092, + "step": 3110 + }, + { + "epoch": 0.1909541587612461, + "grad_norm": 0.2367011308670044, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0074, + "step": 3120 + }, + { + "epoch": 0.1915661913213783, + "grad_norm": 0.41643625497817993, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0062, + "step": 3130 + }, + { + "epoch": 0.19217822388151048, + "grad_norm": 0.33202284574508667, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0081, + "step": 3140 + }, + { + "epoch": 0.1927902564416427, + "grad_norm": 0.279813289642334, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0074, + "step": 3150 + }, + { + "epoch": 0.1934022890017749, + "grad_norm": 0.5127174258232117, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0101, + "step": 3160 + }, + { + "epoch": 0.1940143215619071, + "grad_norm": 0.36921849846839905, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.1946263541220393, + "grad_norm": 0.3509728014469147, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0114, + "step": 3180 + }, + { + "epoch": 0.1952383866821715, + "grad_norm": 0.3088139295578003, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0089, + "step": 3190 + }, + { + "epoch": 0.1958504192423037, + "grad_norm": 0.43653762340545654, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0116, + "step": 3200 + }, + { + "epoch": 0.19646245180243588, + "grad_norm": 0.2522308826446533, + "learning_rate": 1.974353140804231e-05, + "loss": 0.007, + "step": 3210 + }, + { + "epoch": 0.19707448436256808, + "grad_norm": 0.37519100308418274, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0098, + "step": 3220 + }, + { + "epoch": 0.19768651692270028, + "grad_norm": 0.379027783870697, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0086, + "step": 3230 + }, + { + "epoch": 0.1982985494828325, + "grad_norm": 0.2713090479373932, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0073, + "step": 3240 + }, + { + "epoch": 0.1989105820429647, + "grad_norm": 0.41106846928596497, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0188, + "step": 3250 + }, + { + "epoch": 0.1995226146030969, + "grad_norm": 0.3914758861064911, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 0.2001346471632291, + "grad_norm": 0.4763018488883972, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0082, + "step": 3270 + }, + { + "epoch": 0.20074667972336127, + "grad_norm": 0.23002664744853973, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0085, + "step": 3280 + }, + { + "epoch": 0.20135871228349347, + "grad_norm": 0.2887377142906189, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0074, + "step": 3290 + }, + { + "epoch": 0.20197074484362568, + "grad_norm": 0.2322079837322235, + "learning_rate": 1.972231769371516e-05, + "loss": 0.009, + "step": 3300 + }, + { + "epoch": 0.20258277740375788, + "grad_norm": 0.39307233691215515, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0095, + "step": 3310 + }, + { + "epoch": 0.20319480996389008, + "grad_norm": 0.5209783315658569, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.012, + "step": 3320 + }, + { + "epoch": 0.20380684252402229, + "grad_norm": 0.45187172293663025, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0086, + "step": 3330 + }, + { + "epoch": 0.2044188750841545, + "grad_norm": 0.480970174074173, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0072, + "step": 3340 + }, + { + "epoch": 0.20503090764428666, + "grad_norm": 0.30979010462760925, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0091, + "step": 3350 + }, + { + "epoch": 0.20564294020441887, + "grad_norm": 0.6410729289054871, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0096, + "step": 3360 + }, + { + "epoch": 0.20625497276455107, + "grad_norm": 0.23707512021064758, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0078, + "step": 3370 + }, + { + "epoch": 0.20686700532468327, + "grad_norm": 0.3029544949531555, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0115, + "step": 3380 + }, + { + "epoch": 0.20747903788481548, + "grad_norm": 0.28677740693092346, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0068, + "step": 3390 + }, + { + "epoch": 0.20809107044494768, + "grad_norm": 0.2433662712574005, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0082, + "step": 3400 + }, + { + "epoch": 0.20870310300507988, + "grad_norm": 0.38066667318344116, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0089, + "step": 3410 + }, + { + "epoch": 0.20931513556521206, + "grad_norm": 0.3830282390117645, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0073, + "step": 3420 + }, + { + "epoch": 0.20992716812534426, + "grad_norm": 0.359684556722641, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0088, + "step": 3430 + }, + { + "epoch": 0.21053920068547646, + "grad_norm": 0.3497346341609955, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0082, + "step": 3440 + }, + { + "epoch": 0.21115123324560867, + "grad_norm": 0.3664748966693878, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.21176326580574087, + "grad_norm": 0.382804811000824, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0077, + "step": 3460 + }, + { + "epoch": 0.21237529836587307, + "grad_norm": 0.22746194899082184, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0107, + "step": 3470 + }, + { + "epoch": 0.21298733092600527, + "grad_norm": 0.4094266891479492, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0096, + "step": 3480 + }, + { + "epoch": 0.21359936348613745, + "grad_norm": 0.26990365982055664, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0089, + "step": 3490 + }, + { + "epoch": 0.21421139604626965, + "grad_norm": 0.2602371275424957, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0074, + "step": 3500 + }, + { + "epoch": 0.21482342860640186, + "grad_norm": 0.34200435876846313, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21543546116653406, + "grad_norm": 0.4260508716106415, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.21604749372666626, + "grad_norm": 0.4017483592033386, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0103, + "step": 3530 + }, + { + "epoch": 0.21665952628679847, + "grad_norm": 0.40005844831466675, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0094, + "step": 3540 + }, + { + "epoch": 0.21727155884693067, + "grad_norm": 0.3856841027736664, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0095, + "step": 3550 + }, + { + "epoch": 0.21788359140706284, + "grad_norm": 0.3245168626308441, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0067, + "step": 3560 + }, + { + "epoch": 0.21849562396719505, + "grad_norm": 0.2698485255241394, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0079, + "step": 3570 + }, + { + "epoch": 0.21910765652732725, + "grad_norm": 0.24520452320575714, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0109, + "step": 3580 + }, + { + "epoch": 0.21971968908745945, + "grad_norm": 0.397175133228302, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0114, + "step": 3590 + }, + { + "epoch": 0.22033172164759166, + "grad_norm": 0.40339091420173645, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.22094375420772386, + "grad_norm": 0.404435396194458, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0087, + "step": 3610 + }, + { + "epoch": 0.22155578676785606, + "grad_norm": 0.3300188183784485, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0073, + "step": 3620 + }, + { + "epoch": 0.22216781932798824, + "grad_norm": 0.23486892879009247, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0087, + "step": 3630 + }, + { + "epoch": 0.22277985188812044, + "grad_norm": 0.37211188673973083, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0078, + "step": 3640 + }, + { + "epoch": 0.22339188444825264, + "grad_norm": 0.32422709465026855, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.009, + "step": 3650 + }, + { + "epoch": 0.22400391700838485, + "grad_norm": 0.43535664677619934, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0099, + "step": 3660 + }, + { + "epoch": 0.22461594956851705, + "grad_norm": 0.3295724093914032, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.22522798212864925, + "grad_norm": 0.2840734124183655, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0082, + "step": 3680 + }, + { + "epoch": 0.22584001468878145, + "grad_norm": 0.2861844599246979, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0079, + "step": 3690 + }, + { + "epoch": 0.22645204724891363, + "grad_norm": 0.3194407820701599, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0071, + "step": 3700 + }, + { + "epoch": 0.22706407980904583, + "grad_norm": 0.38770729303359985, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0076, + "step": 3710 + }, + { + "epoch": 0.22767611236917804, + "grad_norm": 0.4637960195541382, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0093, + "step": 3720 + }, + { + "epoch": 0.22828814492931024, + "grad_norm": 0.31972312927246094, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0078, + "step": 3730 + }, + { + "epoch": 0.22890017748944244, + "grad_norm": 0.5273001790046692, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0076, + "step": 3740 + }, + { + "epoch": 0.22951221004957464, + "grad_norm": 0.30589622259140015, + "learning_rate": 1.960385541132679e-05, + "loss": 0.009, + "step": 3750 + }, + { + "epoch": 0.23012424260970685, + "grad_norm": 0.31634265184402466, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0063, + "step": 3760 + }, + { + "epoch": 0.23073627516983902, + "grad_norm": 0.32762402296066284, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0081, + "step": 3770 + }, + { + "epoch": 0.23134830772997123, + "grad_norm": 0.42696496844291687, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 0.23196034029010343, + "grad_norm": 0.4676671624183655, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0093, + "step": 3790 + }, + { + "epoch": 0.23257237285023563, + "grad_norm": 0.3347911536693573, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 0.23318440541036783, + "grad_norm": 0.3083193600177765, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0072, + "step": 3810 + }, + { + "epoch": 0.23379643797050004, + "grad_norm": 0.38178423047065735, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0083, + "step": 3820 + }, + { + "epoch": 0.23440847053063224, + "grad_norm": 0.2796846330165863, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0072, + "step": 3830 + }, + { + "epoch": 0.23502050309076442, + "grad_norm": 0.37444883584976196, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.23563253565089662, + "grad_norm": 0.3286772668361664, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23624456821102882, + "grad_norm": 0.45423513650894165, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.23685660077116102, + "grad_norm": 0.36881721019744873, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0068, + "step": 3870 + }, + { + "epoch": 0.23746863333129323, + "grad_norm": 0.3560579717159271, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0084, + "step": 3880 + }, + { + "epoch": 0.23808066589142543, + "grad_norm": 0.43887296319007874, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0076, + "step": 3890 + }, + { + "epoch": 0.23869269845155763, + "grad_norm": 0.3080165982246399, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0069, + "step": 3900 + }, + { + "epoch": 0.2393047310116898, + "grad_norm": 0.2327195703983307, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.239916763571822, + "grad_norm": 0.5960802435874939, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0088, + "step": 3920 + }, + { + "epoch": 0.24052879613195421, + "grad_norm": 0.36213600635528564, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0076, + "step": 3930 + }, + { + "epoch": 0.24114082869208642, + "grad_norm": 0.2950032949447632, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0115, + "step": 3940 + }, + { + "epoch": 0.24175286125221862, + "grad_norm": 0.4527084529399872, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0089, + "step": 3950 + }, + { + "epoch": 0.24236489381235082, + "grad_norm": 0.4422491192817688, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0135, + "step": 3960 + }, + { + "epoch": 0.24297692637248303, + "grad_norm": 0.45049232244491577, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 0.2435889589326152, + "grad_norm": 0.2566494941711426, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0095, + "step": 3980 + }, + { + "epoch": 0.2442009914927474, + "grad_norm": 0.49880343675613403, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0099, + "step": 3990 + }, + { + "epoch": 0.2448130240528796, + "grad_norm": 0.4699341952800751, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0084, + "step": 4000 + }, + { + "epoch": 0.2454250566130118, + "grad_norm": 0.41230708360671997, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 0.246037089173144, + "grad_norm": 0.4836854934692383, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.01, + "step": 4020 + }, + { + "epoch": 0.24664912173327622, + "grad_norm": 0.3056115508079529, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0082, + "step": 4030 + }, + { + "epoch": 0.24726115429340842, + "grad_norm": 0.151325523853302, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0062, + "step": 4040 + }, + { + "epoch": 0.2478731868535406, + "grad_norm": 0.3798811137676239, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0081, + "step": 4050 + }, + { + "epoch": 0.2484852194136728, + "grad_norm": 0.3308229148387909, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 0.249097251973805, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0074, + "step": 4070 + }, + { + "epoch": 0.2497092845339372, + "grad_norm": 0.24179549515247345, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 0.2503213170940694, + "grad_norm": 0.20879383385181427, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 0.2509333496542016, + "grad_norm": 0.39275774359703064, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.2515453822143338, + "grad_norm": 0.2925782799720764, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0095, + "step": 4110 + }, + { + "epoch": 0.252157414774466, + "grad_norm": 0.6465128660202026, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0102, + "step": 4120 + }, + { + "epoch": 0.2527694473345982, + "grad_norm": 0.34663915634155273, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.007, + "step": 4130 + }, + { + "epoch": 0.2533814798947304, + "grad_norm": 0.3387165367603302, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0091, + "step": 4140 + }, + { + "epoch": 0.2539935124548626, + "grad_norm": 0.32989630103111267, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0084, + "step": 4150 + }, + { + "epoch": 0.25460554501499477, + "grad_norm": 0.22870391607284546, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0071, + "step": 4160 + }, + { + "epoch": 0.255217577575127, + "grad_norm": 0.3866496682167053, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0079, + "step": 4170 + }, + { + "epoch": 0.2558296101352592, + "grad_norm": 0.29885268211364746, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0068, + "step": 4180 + }, + { + "epoch": 0.2564416426953914, + "grad_norm": 0.4693736135959625, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 0.2570536752555236, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0074, + "step": 4200 + }, + { + "epoch": 0.2576657078156558, + "grad_norm": 0.21141012012958527, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 0.258277740375788, + "grad_norm": 0.2284570336341858, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0102, + "step": 4220 + }, + { + "epoch": 0.2588897729359202, + "grad_norm": 0.4675048887729645, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0112, + "step": 4230 + }, + { + "epoch": 0.2595018054960524, + "grad_norm": 0.3906441628932953, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0088, + "step": 4240 + }, + { + "epoch": 0.2601138380561846, + "grad_norm": 0.22990387678146362, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0091, + "step": 4250 + }, + { + "epoch": 0.2607258706163168, + "grad_norm": 0.41871073842048645, + "learning_rate": 1.944490251296856e-05, + "loss": 0.009, + "step": 4260 + }, + { + "epoch": 0.261337903176449, + "grad_norm": 0.2724440395832062, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0074, + "step": 4270 + }, + { + "epoch": 0.2619499357365812, + "grad_norm": 0.42590636014938354, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0069, + "step": 4280 + }, + { + "epoch": 0.2625619682967134, + "grad_norm": 0.3604855239391327, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0068, + "step": 4290 + }, + { + "epoch": 0.26317400085684556, + "grad_norm": 0.475304514169693, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0082, + "step": 4300 + }, + { + "epoch": 0.26378603341697776, + "grad_norm": 0.24752479791641235, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0065, + "step": 4310 + }, + { + "epoch": 0.26439806597710996, + "grad_norm": 0.4384835958480835, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0104, + "step": 4320 + }, + { + "epoch": 0.26501009853724217, + "grad_norm": 0.24999107420444489, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0076, + "step": 4330 + }, + { + "epoch": 0.26562213109737437, + "grad_norm": 0.292491614818573, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 0.2662341636575066, + "grad_norm": 0.2380208522081375, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0069, + "step": 4350 + }, + { + "epoch": 0.2668461962176388, + "grad_norm": 0.2906023859977722, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0063, + "step": 4360 + }, + { + "epoch": 0.267458228777771, + "grad_norm": 0.4718990623950958, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0074, + "step": 4370 + }, + { + "epoch": 0.2680702613379032, + "grad_norm": 0.33257269859313965, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0073, + "step": 4380 + }, + { + "epoch": 0.2686822938980354, + "grad_norm": 0.34411463141441345, + "learning_rate": 1.940024231916886e-05, + "loss": 0.006, + "step": 4390 + }, + { + "epoch": 0.2692943264581676, + "grad_norm": 0.40312516689300537, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0093, + "step": 4400 + }, + { + "epoch": 0.2699063590182998, + "grad_norm": 0.2248350828886032, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0082, + "step": 4410 + }, + { + "epoch": 0.270518391578432, + "grad_norm": 0.30094820261001587, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0092, + "step": 4420 + }, + { + "epoch": 0.2711304241385642, + "grad_norm": 0.4277440309524536, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0099, + "step": 4430 + }, + { + "epoch": 0.27174245669869634, + "grad_norm": 0.2876254916191101, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0078, + "step": 4440 + }, + { + "epoch": 0.27235448925882855, + "grad_norm": 0.3453986346721649, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0096, + "step": 4450 + }, + { + "epoch": 0.27296652181896075, + "grad_norm": 0.31379634141921997, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0088, + "step": 4460 + }, + { + "epoch": 0.27357855437909295, + "grad_norm": 0.294477254152298, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0073, + "step": 4470 + }, + { + "epoch": 0.27419058693922516, + "grad_norm": 0.3773270845413208, + "learning_rate": 1.936834723687526e-05, + "loss": 0.008, + "step": 4480 + }, + { + "epoch": 0.27480261949935736, + "grad_norm": 0.31942978501319885, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0086, + "step": 4490 + }, + { + "epoch": 0.27541465205948956, + "grad_norm": 0.46827632188796997, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27602668461962176, + "grad_norm": 0.2735249102115631, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0107, + "step": 4510 + }, + { + "epoch": 0.27663871717975397, + "grad_norm": 0.30048197507858276, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0082, + "step": 4520 + }, + { + "epoch": 0.27725074973988617, + "grad_norm": 0.3507469594478607, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0093, + "step": 4530 + }, + { + "epoch": 0.2778627823000184, + "grad_norm": 0.5642989277839661, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0091, + "step": 4540 + }, + { + "epoch": 0.2784748148601506, + "grad_norm": 0.2769993245601654, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0105, + "step": 4550 + }, + { + "epoch": 0.2790868474202828, + "grad_norm": 0.30269622802734375, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0066, + "step": 4560 + }, + { + "epoch": 0.279698879980415, + "grad_norm": 0.3717023432254791, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0114, + "step": 4570 + }, + { + "epoch": 0.28031091254054713, + "grad_norm": 0.5065163373947144, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0075, + "step": 4580 + }, + { + "epoch": 0.28092294510067933, + "grad_norm": 0.4302189350128174, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0077, + "step": 4590 + }, + { + "epoch": 0.28153497766081154, + "grad_norm": 0.44008374214172363, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0086, + "step": 4600 + }, + { + "epoch": 0.28214701022094374, + "grad_norm": 0.4647364318370819, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0073, + "step": 4610 + }, + { + "epoch": 0.28275904278107594, + "grad_norm": 0.4229913651943207, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0077, + "step": 4620 + }, + { + "epoch": 0.28337107534120815, + "grad_norm": 0.36600178480148315, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.28398310790134035, + "grad_norm": 0.47143280506134033, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0085, + "step": 4640 + }, + { + "epoch": 0.28459514046147255, + "grad_norm": 0.29140496253967285, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0056, + "step": 4650 + }, + { + "epoch": 0.28520717302160475, + "grad_norm": 0.3964666426181793, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0071, + "step": 4660 + }, + { + "epoch": 0.28581920558173696, + "grad_norm": 0.407536119222641, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0095, + "step": 4670 + }, + { + "epoch": 0.28643123814186916, + "grad_norm": 0.33687031269073486, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0091, + "step": 4680 + }, + { + "epoch": 0.28704327070200136, + "grad_norm": 0.3182448446750641, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0087, + "step": 4690 + }, + { + "epoch": 0.28765530326213357, + "grad_norm": 0.40998023748397827, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0099, + "step": 4700 + }, + { + "epoch": 0.28826733582226577, + "grad_norm": 0.28750360012054443, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0087, + "step": 4710 + }, + { + "epoch": 0.2888793683823979, + "grad_norm": 0.36494627594947815, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 0.2894914009425301, + "grad_norm": 0.37047910690307617, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0094, + "step": 4730 + }, + { + "epoch": 0.2901034335026623, + "grad_norm": 0.2577553987503052, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.2907154660627945, + "grad_norm": 0.24589397013187408, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0067, + "step": 4750 + }, + { + "epoch": 0.29132749862292673, + "grad_norm": 0.37927499413490295, + "learning_rate": 1.926404507646751e-05, + "loss": 0.008, + "step": 4760 + }, + { + "epoch": 0.29193953118305893, + "grad_norm": 0.40547946095466614, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0101, + "step": 4770 + }, + { + "epoch": 0.29255156374319113, + "grad_norm": 0.47896578907966614, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0114, + "step": 4780 + }, + { + "epoch": 0.29316359630332334, + "grad_norm": 0.42911696434020996, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0066, + "step": 4790 + }, + { + "epoch": 0.29377562886345554, + "grad_norm": 0.21735505759716034, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0072, + "step": 4800 + }, + { + "epoch": 0.29438766142358774, + "grad_norm": 0.25916650891304016, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.29499969398371995, + "grad_norm": 0.23863966763019562, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0086, + "step": 4820 + }, + { + "epoch": 0.29561172654385215, + "grad_norm": 0.41552650928497314, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0078, + "step": 4830 + }, + { + "epoch": 0.29622375910398435, + "grad_norm": 0.2775874733924866, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0082, + "step": 4840 + }, + { + "epoch": 0.29683579166411656, + "grad_norm": 0.28962916135787964, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0088, + "step": 4850 + }, + { + "epoch": 0.2974478242242487, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0076, + "step": 4860 + }, + { + "epoch": 0.2980598567843809, + "grad_norm": 0.3833489716053009, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0096, + "step": 4870 + }, + { + "epoch": 0.2986718893445131, + "grad_norm": 0.20357537269592285, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.2992839219046453, + "grad_norm": 0.4648539423942566, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0099, + "step": 4890 + }, + { + "epoch": 0.2998959544647775, + "grad_norm": 0.2701941728591919, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0062, + "step": 4900 + }, + { + "epoch": 0.3005079870249097, + "grad_norm": 0.31277161836624146, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0069, + "step": 4910 + }, + { + "epoch": 0.3011200195850419, + "grad_norm": 0.27697697281837463, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0094, + "step": 4920 + }, + { + "epoch": 0.3017320521451741, + "grad_norm": 0.22880606353282928, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0074, + "step": 4930 + }, + { + "epoch": 0.3023440847053063, + "grad_norm": 0.258404940366745, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0078, + "step": 4940 + }, + { + "epoch": 0.30295611726543853, + "grad_norm": 0.394394189119339, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0109, + "step": 4950 + }, + { + "epoch": 0.30356814982557073, + "grad_norm": 0.24108687043190002, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0082, + "step": 4960 + }, + { + "epoch": 0.30418018238570294, + "grad_norm": 0.34520867466926575, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0098, + "step": 4970 + }, + { + "epoch": 0.30479221494583514, + "grad_norm": 0.33723267912864685, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0104, + "step": 4980 + }, + { + "epoch": 0.30540424750596734, + "grad_norm": 0.28276878595352173, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0072, + "step": 4990 + }, + { + "epoch": 0.30601628006609954, + "grad_norm": 0.32236188650131226, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.012, + "step": 5000 + }, + { + "epoch": 0.3066283126262317, + "grad_norm": 0.20596888661384583, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0073, + "step": 5010 + }, + { + "epoch": 0.3072403451863639, + "grad_norm": 0.37921255826950073, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0073, + "step": 5020 + }, + { + "epoch": 0.3078523777464961, + "grad_norm": 0.30738911032676697, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0083, + "step": 5030 + }, + { + "epoch": 0.3084644103066283, + "grad_norm": 0.1938163936138153, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.3090764428667605, + "grad_norm": 0.25826898217201233, + "learning_rate": 1.914800406458133e-05, + "loss": 0.008, + "step": 5050 + }, + { + "epoch": 0.3096884754268927, + "grad_norm": 0.18951697647571564, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0058, + "step": 5060 + }, + { + "epoch": 0.3103005079870249, + "grad_norm": 0.3877381980419159, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3109125405471571, + "grad_norm": 0.3133573830127716, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0088, + "step": 5080 + }, + { + "epoch": 0.3115245731072893, + "grad_norm": 0.33131852746009827, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0062, + "step": 5090 + }, + { + "epoch": 0.3121366056674215, + "grad_norm": 0.21276263892650604, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0071, + "step": 5100 + }, + { + "epoch": 0.3127486382275537, + "grad_norm": 0.46878281235694885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0084, + "step": 5110 + }, + { + "epoch": 0.3133606707876859, + "grad_norm": 0.44227683544158936, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0097, + "step": 5120 + }, + { + "epoch": 0.3139727033478181, + "grad_norm": 0.41950204968452454, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.31458473590795033, + "grad_norm": 0.4214445948600769, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0081, + "step": 5140 + }, + { + "epoch": 0.3151967684680825, + "grad_norm": 0.3779868483543396, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0075, + "step": 5150 + }, + { + "epoch": 0.3158088010282147, + "grad_norm": 0.4587777853012085, + "learning_rate": 1.910187855634501e-05, + "loss": 0.009, + "step": 5160 + }, + { + "epoch": 0.3164208335883469, + "grad_norm": 0.4875587224960327, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0099, + "step": 5170 + }, + { + "epoch": 0.3170328661484791, + "grad_norm": 0.22378237545490265, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0071, + "step": 5180 + }, + { + "epoch": 0.3176448987086113, + "grad_norm": 0.3360678553581238, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0101, + "step": 5190 + }, + { + "epoch": 0.3182569312687435, + "grad_norm": 0.36370640993118286, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0068, + "step": 5200 + }, + { + "epoch": 0.3188689638288757, + "grad_norm": 0.25814393162727356, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0076, + "step": 5210 + }, + { + "epoch": 0.3194809963890079, + "grad_norm": 0.39010074734687805, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0066, + "step": 5220 + }, + { + "epoch": 0.3200930289491401, + "grad_norm": 0.44009074568748474, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0078, + "step": 5230 + }, + { + "epoch": 0.3207050615092723, + "grad_norm": 0.45733046531677246, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0074, + "step": 5240 + }, + { + "epoch": 0.3213170940694045, + "grad_norm": 0.4555135667324066, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0089, + "step": 5250 + }, + { + "epoch": 0.3219291266295367, + "grad_norm": 0.5864276885986328, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0083, + "step": 5260 + }, + { + "epoch": 0.3225411591896689, + "grad_norm": 0.3305470943450928, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0094, + "step": 5270 + }, + { + "epoch": 0.3231531917498011, + "grad_norm": 0.21458053588867188, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0091, + "step": 5280 + }, + { + "epoch": 0.32376522430993326, + "grad_norm": 0.2927384376525879, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.007, + "step": 5290 + }, + { + "epoch": 0.32437725687006547, + "grad_norm": 0.387608140707016, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0068, + "step": 5300 + }, + { + "epoch": 0.32498928943019767, + "grad_norm": 0.28193122148513794, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0065, + "step": 5310 + }, + { + "epoch": 0.3256013219903299, + "grad_norm": 0.33098119497299194, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0082, + "step": 5320 + }, + { + "epoch": 0.3262133545504621, + "grad_norm": 0.5442482233047485, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0124, + "step": 5330 + }, + { + "epoch": 0.3268253871105943, + "grad_norm": 0.503669798374176, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0093, + "step": 5340 + }, + { + "epoch": 0.3274374196707265, + "grad_norm": 0.2307574301958084, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0071, + "step": 5350 + }, + { + "epoch": 0.3280494522308587, + "grad_norm": 0.3543917238712311, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.007, + "step": 5360 + }, + { + "epoch": 0.3286614847909909, + "grad_norm": 0.21763169765472412, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0059, + "step": 5370 + }, + { + "epoch": 0.3292735173511231, + "grad_norm": 0.38023391366004944, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 0.3298855499112553, + "grad_norm": 0.44597327709198, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0069, + "step": 5390 + }, + { + "epoch": 0.3304975824713875, + "grad_norm": 0.2994389533996582, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0083, + "step": 5400 + }, + { + "epoch": 0.3311096150315197, + "grad_norm": 0.26668304204940796, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0071, + "step": 5410 + }, + { + "epoch": 0.3317216475916519, + "grad_norm": 0.25944197177886963, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0065, + "step": 5420 + }, + { + "epoch": 0.33233368015178405, + "grad_norm": 0.3646431267261505, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0065, + "step": 5430 + }, + { + "epoch": 0.33294571271191625, + "grad_norm": 0.34860959649086, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0098, + "step": 5440 + }, + { + "epoch": 0.33355774527204846, + "grad_norm": 0.33718568086624146, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0069, + "step": 5450 + }, + { + "epoch": 0.33416977783218066, + "grad_norm": 0.2417302280664444, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0064, + "step": 5460 + }, + { + "epoch": 0.33478181039231286, + "grad_norm": 0.26607826352119446, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0048, + "step": 5470 + }, + { + "epoch": 0.33539384295244506, + "grad_norm": 0.31762364506721497, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0064, + "step": 5480 + }, + { + "epoch": 0.33600587551257727, + "grad_norm": 0.21427015960216522, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0079, + "step": 5490 + }, + { + "epoch": 0.33661790807270947, + "grad_norm": 0.3372637629508972, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0077, + "step": 5500 + }, + { + "epoch": 0.3372299406328417, + "grad_norm": 0.3760700821876526, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0066, + "step": 5510 + }, + { + "epoch": 0.3378419731929739, + "grad_norm": 0.22838029265403748, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0061, + "step": 5520 + }, + { + "epoch": 0.3384540057531061, + "grad_norm": 0.3105243444442749, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0089, + "step": 5530 + }, + { + "epoch": 0.3390660383132383, + "grad_norm": 0.23694929480552673, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0086, + "step": 5540 + }, + { + "epoch": 0.3396780708733705, + "grad_norm": 0.22935174405574799, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.3402901034335027, + "grad_norm": 0.26384714245796204, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 0.34090213599363484, + "grad_norm": 0.33245643973350525, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0061, + "step": 5570 + }, + { + "epoch": 0.34151416855376704, + "grad_norm": 0.3904813230037689, + "learning_rate": 1.891523933768891e-05, + "loss": 0.009, + "step": 5580 + }, + { + "epoch": 0.34212620111389924, + "grad_norm": 0.33858415484428406, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0067, + "step": 5590 + }, + { + "epoch": 0.34273823367403145, + "grad_norm": 0.3197486996650696, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0082, + "step": 5600 + }, + { + "epoch": 0.34335026623416365, + "grad_norm": 0.23814789950847626, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0068, + "step": 5610 + }, + { + "epoch": 0.34396229879429585, + "grad_norm": 0.3820457458496094, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0082, + "step": 5620 + }, + { + "epoch": 0.34457433135442805, + "grad_norm": 0.27518680691719055, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0063, + "step": 5630 + }, + { + "epoch": 0.34518636391456026, + "grad_norm": 0.24741721153259277, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 0.34579839647469246, + "grad_norm": 0.5140052437782288, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0178, + "step": 5650 + }, + { + "epoch": 0.34641042903482466, + "grad_norm": 0.5363543033599854, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0097, + "step": 5660 + }, + { + "epoch": 0.34702246159495687, + "grad_norm": 0.41116055846214294, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0078, + "step": 5670 + }, + { + "epoch": 0.34763449415508907, + "grad_norm": 0.412762314081192, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0086, + "step": 5680 + }, + { + "epoch": 0.34824652671522127, + "grad_norm": 0.399527907371521, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0088, + "step": 5690 + }, + { + "epoch": 0.3488585592753535, + "grad_norm": 0.3447834551334381, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0084, + "step": 5700 + }, + { + "epoch": 0.3494705918354856, + "grad_norm": 0.3418859541416168, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0092, + "step": 5710 + }, + { + "epoch": 0.3500826243956178, + "grad_norm": 0.3336535692214966, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0065, + "step": 5720 + }, + { + "epoch": 0.35069465695575003, + "grad_norm": 0.34575122594833374, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0068, + "step": 5730 + }, + { + "epoch": 0.35130668951588223, + "grad_norm": 0.34325110912323, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.01, + "step": 5740 + }, + { + "epoch": 0.35191872207601443, + "grad_norm": 0.20104236900806427, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0085, + "step": 5750 + }, + { + "epoch": 0.35253075463614664, + "grad_norm": 0.33699074387550354, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0073, + "step": 5760 + }, + { + "epoch": 0.35314278719627884, + "grad_norm": 0.33322635293006897, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.35375481975641104, + "grad_norm": 0.26897475123405457, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0087, + "step": 5780 + }, + { + "epoch": 0.35436685231654325, + "grad_norm": 0.5310013890266418, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0067, + "step": 5790 + }, + { + "epoch": 0.35497888487667545, + "grad_norm": 0.4203440845012665, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0097, + "step": 5800 + }, + { + "epoch": 0.35559091743680765, + "grad_norm": 0.2179369181394577, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0082, + "step": 5810 + }, + { + "epoch": 0.35620294999693985, + "grad_norm": 0.2789444625377655, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0066, + "step": 5820 + }, + { + "epoch": 0.35681498255707206, + "grad_norm": 0.28009694814682007, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.007, + "step": 5830 + }, + { + "epoch": 0.35742701511720426, + "grad_norm": 0.304768443107605, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0065, + "step": 5840 + }, + { + "epoch": 0.3580390476773364, + "grad_norm": 0.2829401195049286, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0061, + "step": 5850 + }, + { + "epoch": 0.3586510802374686, + "grad_norm": 0.3388998508453369, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0083, + "step": 5860 + }, + { + "epoch": 0.3592631127976008, + "grad_norm": 0.3313426673412323, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 0.359875145357733, + "grad_norm": 0.2886904180049896, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0094, + "step": 5880 + }, + { + "epoch": 0.3604871779178652, + "grad_norm": 0.3132432997226715, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0086, + "step": 5890 + }, + { + "epoch": 0.3610992104779974, + "grad_norm": 0.37195107340812683, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0137, + "step": 5900 + }, + { + "epoch": 0.3617112430381296, + "grad_norm": 0.30853375792503357, + "learning_rate": 1.875708056549365e-05, + "loss": 0.01, + "step": 5910 + }, + { + "epoch": 0.36232327559826183, + "grad_norm": 0.39785459637641907, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0073, + "step": 5920 + }, + { + "epoch": 0.36293530815839403, + "grad_norm": 0.26958727836608887, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0059, + "step": 5930 + }, + { + "epoch": 0.36354734071852624, + "grad_norm": 0.354956716299057, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0069, + "step": 5940 + }, + { + "epoch": 0.36415937327865844, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0066, + "step": 5950 + }, + { + "epoch": 0.36477140583879064, + "grad_norm": 0.30000701546669006, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0075, + "step": 5960 + }, + { + "epoch": 0.36538343839892284, + "grad_norm": 0.5558263063430786, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0083, + "step": 5970 + }, + { + "epoch": 0.36599547095905505, + "grad_norm": 0.39146295189857483, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0062, + "step": 5980 + }, + { + "epoch": 0.3666075035191872, + "grad_norm": 0.44002753496170044, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0089, + "step": 5990 + }, + { + "epoch": 0.3672195360793194, + "grad_norm": 0.3220095932483673, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0097, + "step": 6000 + }, + { + "epoch": 0.3678315686394516, + "grad_norm": 0.3569507598876953, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0067, + "step": 6010 + }, + { + "epoch": 0.3684436011995838, + "grad_norm": 0.3004184365272522, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0084, + "step": 6020 + }, + { + "epoch": 0.369055633759716, + "grad_norm": 0.2931320071220398, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0079, + "step": 6030 + }, + { + "epoch": 0.3696676663198482, + "grad_norm": 0.39551016688346863, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0088, + "step": 6040 + }, + { + "epoch": 0.3702796988799804, + "grad_norm": 0.33755603432655334, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0082, + "step": 6050 + }, + { + "epoch": 0.3708917314401126, + "grad_norm": 0.3101558983325958, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0069, + "step": 6060 + }, + { + "epoch": 0.3715037640002448, + "grad_norm": 0.2921602129936218, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0065, + "step": 6070 + }, + { + "epoch": 0.372115796560377, + "grad_norm": 0.3601403832435608, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0063, + "step": 6080 + }, + { + "epoch": 0.3727278291205092, + "grad_norm": 0.34929168224334717, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0073, + "step": 6090 + }, + { + "epoch": 0.3733398616806414, + "grad_norm": 0.3987390995025635, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0068, + "step": 6100 + }, + { + "epoch": 0.37395189424077363, + "grad_norm": 0.2641090452671051, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0091, + "step": 6110 + }, + { + "epoch": 0.37456392680090583, + "grad_norm": 0.23139338195323944, + "learning_rate": 1.865125972978549e-05, + "loss": 0.006, + "step": 6120 + }, + { + "epoch": 0.375175959361038, + "grad_norm": 0.26552167534828186, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.3757879919211702, + "grad_norm": 0.43827885389328003, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0084, + "step": 6140 + }, + { + "epoch": 0.3764000244813024, + "grad_norm": 0.27495354413986206, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.012, + "step": 6150 + }, + { + "epoch": 0.3770120570414346, + "grad_norm": 0.36078640818595886, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0104, + "step": 6160 + }, + { + "epoch": 0.3776240896015668, + "grad_norm": 0.28252753615379333, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0103, + "step": 6170 + }, + { + "epoch": 0.378236122161699, + "grad_norm": 0.2674558162689209, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0069, + "step": 6180 + }, + { + "epoch": 0.3788481547218312, + "grad_norm": 0.21457509696483612, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0089, + "step": 6190 + }, + { + "epoch": 0.3794601872819634, + "grad_norm": 0.3142339885234833, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0075, + "step": 6200 + }, + { + "epoch": 0.3800722198420956, + "grad_norm": 0.32714203000068665, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0084, + "step": 6210 + }, + { + "epoch": 0.3806842524022278, + "grad_norm": 0.2632557153701782, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0072, + "step": 6220 + }, + { + "epoch": 0.38129628496236, + "grad_norm": 0.1893932968378067, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0063, + "step": 6230 + }, + { + "epoch": 0.3819083175224922, + "grad_norm": 0.49935290217399597, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0087, + "step": 6240 + }, + { + "epoch": 0.3825203500826244, + "grad_norm": 0.34605127573013306, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0101, + "step": 6250 + }, + { + "epoch": 0.3831323826427566, + "grad_norm": 0.3294198513031006, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0067, + "step": 6260 + }, + { + "epoch": 0.38374441520288877, + "grad_norm": 0.34797370433807373, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0065, + "step": 6270 + }, + { + "epoch": 0.38435644776302097, + "grad_norm": 0.37710750102996826, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0061, + "step": 6280 + }, + { + "epoch": 0.3849684803231532, + "grad_norm": 0.39949893951416016, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0066, + "step": 6290 + }, + { + "epoch": 0.3855805128832854, + "grad_norm": 0.33014294505119324, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0095, + "step": 6300 + }, + { + "epoch": 0.3861925454434176, + "grad_norm": 0.4329249858856201, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0089, + "step": 6310 + }, + { + "epoch": 0.3868045780035498, + "grad_norm": 0.298330157995224, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0076, + "step": 6320 + }, + { + "epoch": 0.387416610563682, + "grad_norm": 0.2672661542892456, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 0.3880286431238142, + "grad_norm": 0.48193076252937317, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0103, + "step": 6340 + }, + { + "epoch": 0.3886406756839464, + "grad_norm": 0.29180601239204407, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0075, + "step": 6350 + }, + { + "epoch": 0.3892527082440786, + "grad_norm": 0.21320492029190063, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0077, + "step": 6360 + }, + { + "epoch": 0.3898647408042108, + "grad_norm": 0.37252935767173767, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0078, + "step": 6370 + }, + { + "epoch": 0.390476773364343, + "grad_norm": 0.284586101770401, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0089, + "step": 6380 + }, + { + "epoch": 0.3910888059244752, + "grad_norm": 0.5030382871627808, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0088, + "step": 6390 + }, + { + "epoch": 0.3917008384846074, + "grad_norm": 0.357239305973053, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0069, + "step": 6400 + }, + { + "epoch": 0.39231287104473955, + "grad_norm": 0.20308594405651093, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0062, + "step": 6410 + }, + { + "epoch": 0.39292490360487176, + "grad_norm": 0.2678150534629822, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.39353693616500396, + "grad_norm": 0.35160595178604126, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0072, + "step": 6430 + }, + { + "epoch": 0.39414896872513616, + "grad_norm": 0.33254173398017883, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 0.39476100128526836, + "grad_norm": 0.22763408720493317, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0061, + "step": 6450 + }, + { + "epoch": 0.39537303384540057, + "grad_norm": 0.20889192819595337, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 0.39598506640553277, + "grad_norm": 0.22515206038951874, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0086, + "step": 6470 + }, + { + "epoch": 0.396597098965665, + "grad_norm": 0.36421817541122437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0064, + "step": 6480 + }, + { + "epoch": 0.3972091315257972, + "grad_norm": 0.3869773745536804, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0084, + "step": 6490 + }, + { + "epoch": 0.3978211640859294, + "grad_norm": 0.26248687505722046, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0061, + "step": 6500 + }, + { + "epoch": 0.3984331966460616, + "grad_norm": 0.22152310609817505, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0057, + "step": 6510 + }, + { + "epoch": 0.3990452292061938, + "grad_norm": 0.25921961665153503, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0071, + "step": 6520 + }, + { + "epoch": 0.399657261766326, + "grad_norm": 0.3289903998374939, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0076, + "step": 6530 + }, + { + "epoch": 0.4002692943264582, + "grad_norm": 0.2767571210861206, + "learning_rate": 1.8427795928237e-05, + "loss": 0.01, + "step": 6540 + }, + { + "epoch": 0.40088132688659034, + "grad_norm": 0.46339666843414307, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0064, + "step": 6550 + }, + { + "epoch": 0.40149335944672254, + "grad_norm": 0.2942553460597992, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0067, + "step": 6560 + }, + { + "epoch": 0.40210539200685474, + "grad_norm": 0.3868240714073181, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0085, + "step": 6570 + }, + { + "epoch": 0.40271742456698695, + "grad_norm": 0.3999684154987335, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0067, + "step": 6580 + }, + { + "epoch": 0.40332945712711915, + "grad_norm": 0.42856812477111816, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0091, + "step": 6590 + }, + { + "epoch": 0.40394148968725135, + "grad_norm": 0.3099806010723114, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0084, + "step": 6600 + }, + { + "epoch": 0.40455352224738356, + "grad_norm": 0.3798827826976776, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0066, + "step": 6610 + }, + { + "epoch": 0.40516555480751576, + "grad_norm": 0.19007280468940735, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 0.40577758736764796, + "grad_norm": 0.3723277151584625, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0085, + "step": 6630 + }, + { + "epoch": 0.40638961992778017, + "grad_norm": 0.21034900844097137, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0069, + "step": 6640 + }, + { + "epoch": 0.40700165248791237, + "grad_norm": 0.29838645458221436, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0095, + "step": 6650 + }, + { + "epoch": 0.40761368504804457, + "grad_norm": 0.2645854353904724, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0061, + "step": 6660 + }, + { + "epoch": 0.4082257176081768, + "grad_norm": 0.21633592247962952, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.006, + "step": 6670 + }, + { + "epoch": 0.408837750168309, + "grad_norm": 0.25387731194496155, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.008, + "step": 6680 + }, + { + "epoch": 0.4094497827284412, + "grad_norm": 0.3752288520336151, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0092, + "step": 6690 + }, + { + "epoch": 0.41006181528857333, + "grad_norm": 0.33368971943855286, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0072, + "step": 6700 + }, + { + "epoch": 0.41067384784870553, + "grad_norm": 0.34388917684555054, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0074, + "step": 6710 + }, + { + "epoch": 0.41128588040883773, + "grad_norm": 0.2683192789554596, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.007, + "step": 6720 + }, + { + "epoch": 0.41189791296896994, + "grad_norm": 0.5121234059333801, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0074, + "step": 6730 + }, + { + "epoch": 0.41250994552910214, + "grad_norm": 0.333406925201416, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0117, + "step": 6740 + }, + { + "epoch": 0.41312197808923434, + "grad_norm": 0.26011794805526733, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0062, + "step": 6750 + }, + { + "epoch": 0.41373401064936655, + "grad_norm": 0.28925821185112, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0066, + "step": 6760 + }, + { + "epoch": 0.41434604320949875, + "grad_norm": 0.2202957570552826, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0058, + "step": 6770 + }, + { + "epoch": 0.41495807576963095, + "grad_norm": 0.2740793824195862, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0072, + "step": 6780 + }, + { + "epoch": 0.41557010832976315, + "grad_norm": 0.46569427847862244, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0069, + "step": 6790 + }, + { + "epoch": 0.41618214088989536, + "grad_norm": 0.3959881067276001, + "learning_rate": 1.828172598376902e-05, + "loss": 0.009, + "step": 6800 + }, + { + "epoch": 0.41679417345002756, + "grad_norm": 0.2465214729309082, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0068, + "step": 6810 + }, + { + "epoch": 0.41740620601015976, + "grad_norm": 0.3207756280899048, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0083, + "step": 6820 + }, + { + "epoch": 0.41801823857029197, + "grad_norm": 0.5600990653038025, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.4186302711304241, + "grad_norm": 0.32832831144332886, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0072, + "step": 6840 + }, + { + "epoch": 0.4192423036905563, + "grad_norm": 0.3397129774093628, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0083, + "step": 6850 + }, + { + "epoch": 0.4198543362506885, + "grad_norm": 0.3481312096118927, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.4204663688108207, + "grad_norm": 0.4542059898376465, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0104, + "step": 6870 + }, + { + "epoch": 0.4210784013709529, + "grad_norm": 0.2517620325088501, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0089, + "step": 6880 + }, + { + "epoch": 0.42169043393108513, + "grad_norm": 0.3671923875808716, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0068, + "step": 6890 + }, + { + "epoch": 0.42230246649121733, + "grad_norm": 0.41340726613998413, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0084, + "step": 6900 + }, + { + "epoch": 0.42291449905134954, + "grad_norm": 0.22815965116024017, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 0.42352653161148174, + "grad_norm": 0.35324010252952576, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0073, + "step": 6920 + }, + { + "epoch": 0.42413856417161394, + "grad_norm": 0.30134323239326477, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0071, + "step": 6930 + }, + { + "epoch": 0.42475059673174614, + "grad_norm": 0.4007415771484375, + "learning_rate": 1.82006727813775e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 0.42536262929187835, + "grad_norm": 0.3320179879665375, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0074, + "step": 6950 + }, + { + "epoch": 0.42597466185201055, + "grad_norm": 0.311971515417099, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0062, + "step": 6960 + }, + { + "epoch": 0.42658669441214275, + "grad_norm": 0.34347453713417053, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 0.4271987269722749, + "grad_norm": 0.25632336735725403, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.4278107595324071, + "grad_norm": 0.21711130440235138, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0068, + "step": 6990 + }, + { + "epoch": 0.4284227920925393, + "grad_norm": 0.3381270170211792, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0064, + "step": 7000 + }, + { + "epoch": 0.4290348246526715, + "grad_norm": 0.32262885570526123, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0091, + "step": 7010 + }, + { + "epoch": 0.4296468572128037, + "grad_norm": 0.65865558385849, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0104, + "step": 7020 + }, + { + "epoch": 0.4302588897729359, + "grad_norm": 0.3021128177642822, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.009, + "step": 7030 + }, + { + "epoch": 0.4308709223330681, + "grad_norm": 0.2859005331993103, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0065, + "step": 7040 + }, + { + "epoch": 0.4314829548932003, + "grad_norm": 0.3379405736923218, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0062, + "step": 7050 + }, + { + "epoch": 0.4320949874533325, + "grad_norm": 0.22009991109371185, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.007, + "step": 7060 + }, + { + "epoch": 0.4327070200134647, + "grad_norm": 0.24766206741333008, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0072, + "step": 7070 + }, + { + "epoch": 0.43331905257359693, + "grad_norm": 0.3557615280151367, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0096, + "step": 7080 + }, + { + "epoch": 0.43393108513372913, + "grad_norm": 0.5700691938400269, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0066, + "step": 7090 + }, + { + "epoch": 0.43454311769386134, + "grad_norm": 0.3194892704486847, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0065, + "step": 7100 + }, + { + "epoch": 0.43515515025399354, + "grad_norm": 0.2766750752925873, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0074, + "step": 7110 + }, + { + "epoch": 0.4357671828141257, + "grad_norm": 0.2775132656097412, + "learning_rate": 1.809403050791396e-05, + "loss": 0.007, + "step": 7120 + }, + { + "epoch": 0.4363792153742579, + "grad_norm": 0.4468507170677185, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0066, + "step": 7130 + }, + { + "epoch": 0.4369912479343901, + "grad_norm": 0.3282400369644165, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0185, + "step": 7140 + }, + { + "epoch": 0.4376032804945223, + "grad_norm": 0.2625710964202881, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0088, + "step": 7150 + }, + { + "epoch": 0.4382153130546545, + "grad_norm": 0.47729599475860596, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.008, + "step": 7160 + }, + { + "epoch": 0.4388273456147867, + "grad_norm": 0.30350950360298157, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0091, + "step": 7170 + }, + { + "epoch": 0.4394393781749189, + "grad_norm": 0.3514627516269684, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0065, + "step": 7180 + }, + { + "epoch": 0.4400514107350511, + "grad_norm": 0.26150578260421753, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0087, + "step": 7190 + }, + { + "epoch": 0.4406634432951833, + "grad_norm": 0.374138206243515, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0073, + "step": 7200 + }, + { + "epoch": 0.4412754758553155, + "grad_norm": 0.2980635166168213, + "learning_rate": 1.803969531201634e-05, + "loss": 0.007, + "step": 7210 + }, + { + "epoch": 0.4418875084154477, + "grad_norm": 0.38190510869026184, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0077, + "step": 7220 + }, + { + "epoch": 0.4424995409755799, + "grad_norm": 0.28819066286087036, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0065, + "step": 7230 + }, + { + "epoch": 0.4431115735357121, + "grad_norm": 0.43382275104522705, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0077, + "step": 7240 + }, + { + "epoch": 0.4437236060958443, + "grad_norm": 0.31589648127555847, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0065, + "step": 7250 + }, + { + "epoch": 0.4443356386559765, + "grad_norm": 0.3744536340236664, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0067, + "step": 7260 + }, + { + "epoch": 0.4449476712161087, + "grad_norm": 0.2600225806236267, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.008, + "step": 7270 + }, + { + "epoch": 0.4455597037762409, + "grad_norm": 0.28064799308776855, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0061, + "step": 7280 + }, + { + "epoch": 0.4461717363363731, + "grad_norm": 0.2745135426521301, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0075, + "step": 7290 + }, + { + "epoch": 0.4467837688965053, + "grad_norm": 0.23609793186187744, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0056, + "step": 7300 + }, + { + "epoch": 0.4473958014566375, + "grad_norm": 0.35910022258758545, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0079, + "step": 7310 + }, + { + "epoch": 0.4480078340167697, + "grad_norm": 0.22230662405490875, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0065, + "step": 7320 + }, + { + "epoch": 0.4486198665769019, + "grad_norm": 0.3835199475288391, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.008, + "step": 7330 + }, + { + "epoch": 0.4492318991370341, + "grad_norm": 0.37863102555274963, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0083, + "step": 7340 + }, + { + "epoch": 0.4498439316971663, + "grad_norm": 0.25412216782569885, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0056, + "step": 7350 + }, + { + "epoch": 0.4504559642572985, + "grad_norm": 0.43248918652534485, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0061, + "step": 7360 + }, + { + "epoch": 0.4510679968174307, + "grad_norm": 0.2937811613082886, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0073, + "step": 7370 + }, + { + "epoch": 0.4516800293775629, + "grad_norm": 0.3018436133861542, + "learning_rate": 1.793524061803872e-05, + "loss": 0.007, + "step": 7380 + }, + { + "epoch": 0.4522920619376951, + "grad_norm": 0.32781726121902466, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0079, + "step": 7390 + }, + { + "epoch": 0.45290409449782726, + "grad_norm": 0.2843719720840454, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0067, + "step": 7400 + }, + { + "epoch": 0.45351612705795946, + "grad_norm": 0.27588292956352234, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0092, + "step": 7410 + }, + { + "epoch": 0.45412815961809166, + "grad_norm": 0.38858234882354736, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0067, + "step": 7420 + }, + { + "epoch": 0.45474019217822387, + "grad_norm": 0.4235166609287262, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 0.45535222473835607, + "grad_norm": 0.272210031747818, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0059, + "step": 7440 + }, + { + "epoch": 0.4559642572984883, + "grad_norm": 0.23851896822452545, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0071, + "step": 7450 + }, + { + "epoch": 0.4565762898586205, + "grad_norm": 0.37179476022720337, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0073, + "step": 7460 + }, + { + "epoch": 0.4571883224187527, + "grad_norm": 0.31902605295181274, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.009, + "step": 7470 + }, + { + "epoch": 0.4578003549788849, + "grad_norm": 0.47023633122444153, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0091, + "step": 7480 + }, + { + "epoch": 0.4584123875390171, + "grad_norm": 0.35726839303970337, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0079, + "step": 7490 + }, + { + "epoch": 0.4590244200991493, + "grad_norm": 0.27567291259765625, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0073, + "step": 7500 + }, + { + "epoch": 0.4596364526592815, + "grad_norm": 0.23053516447544098, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0065, + "step": 7510 + }, + { + "epoch": 0.4602484852194137, + "grad_norm": 0.2169056385755539, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0054, + "step": 7520 + }, + { + "epoch": 0.4608605177795459, + "grad_norm": 0.2912258207798004, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.46147255033967804, + "grad_norm": 0.2527846097946167, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.006, + "step": 7540 + }, + { + "epoch": 0.46208458289981025, + "grad_norm": 0.3878445029258728, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0079, + "step": 7550 + }, + { + "epoch": 0.46269661545994245, + "grad_norm": 0.3981980085372925, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0081, + "step": 7560 + }, + { + "epoch": 0.46330864802007465, + "grad_norm": 0.48834845423698425, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "epoch": 0.46392068058020686, + "grad_norm": 0.3045276701450348, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0085, + "step": 7580 + }, + { + "epoch": 0.46453271314033906, + "grad_norm": 0.23345299065113068, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0072, + "step": 7590 + }, + { + "epoch": 0.46514474570047126, + "grad_norm": 0.3632943034172058, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0085, + "step": 7600 + }, + { + "epoch": 0.46575677826060347, + "grad_norm": 0.19813670217990875, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0073, + "step": 7610 + }, + { + "epoch": 0.46636881082073567, + "grad_norm": 0.36094173789024353, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0085, + "step": 7620 + }, + { + "epoch": 0.46698084338086787, + "grad_norm": 0.30049464106559753, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0085, + "step": 7630 + }, + { + "epoch": 0.4675928759410001, + "grad_norm": 0.27693697810173035, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0057, + "step": 7640 + }, + { + "epoch": 0.4682049085011323, + "grad_norm": 0.3656866252422333, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0064, + "step": 7650 + }, + { + "epoch": 0.4688169410612645, + "grad_norm": 0.602168083190918, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0076, + "step": 7660 + }, + { + "epoch": 0.4694289736213967, + "grad_norm": 0.3553078770637512, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0062, + "step": 7670 + }, + { + "epoch": 0.47004100618152883, + "grad_norm": 0.326695054769516, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0062, + "step": 7680 + }, + { + "epoch": 0.47065303874166103, + "grad_norm": 0.2762170732021332, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0075, + "step": 7690 + }, + { + "epoch": 0.47126507130179324, + "grad_norm": 0.35057321190834045, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0063, + "step": 7700 + }, + { + "epoch": 0.47187710386192544, + "grad_norm": 0.3906462788581848, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0086, + "step": 7710 + }, + { + "epoch": 0.47248913642205764, + "grad_norm": 0.290752112865448, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0087, + "step": 7720 + }, + { + "epoch": 0.47310116898218985, + "grad_norm": 0.2242034673690796, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.47371320154232205, + "grad_norm": 0.3283435106277466, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0068, + "step": 7740 + }, + { + "epoch": 0.47432523410245425, + "grad_norm": 0.24059069156646729, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0056, + "step": 7750 + }, + { + "epoch": 0.47493726666258645, + "grad_norm": 0.2978667914867401, + "learning_rate": 1.769330275540774e-05, + "loss": 0.007, + "step": 7760 + }, + { + "epoch": 0.47554929922271866, + "grad_norm": 0.2605571150779724, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0084, + "step": 7770 + }, + { + "epoch": 0.47616133178285086, + "grad_norm": 0.4010445475578308, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0075, + "step": 7780 + }, + { + "epoch": 0.47677336434298306, + "grad_norm": 0.31932029128074646, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0076, + "step": 7790 + }, + { + "epoch": 0.47738539690311527, + "grad_norm": 0.3508684039115906, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0067, + "step": 7800 + }, + { + "epoch": 0.47799742946324747, + "grad_norm": 0.2835206091403961, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0064, + "step": 7810 + }, + { + "epoch": 0.4786094620233796, + "grad_norm": 0.2661663293838501, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0051, + "step": 7820 + }, + { + "epoch": 0.4792214945835118, + "grad_norm": 0.4146379828453064, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.479833527143644, + "grad_norm": 0.38621196150779724, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0076, + "step": 7840 + }, + { + "epoch": 0.4804455597037762, + "grad_norm": 0.19052188098430634, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.008, + "step": 7850 + }, + { + "epoch": 0.48105759226390843, + "grad_norm": 0.3699149489402771, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0071, + "step": 7860 + }, + { + "epoch": 0.48166962482404063, + "grad_norm": 0.3756427764892578, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0071, + "step": 7870 + }, + { + "epoch": 0.48228165738417283, + "grad_norm": 0.2987386882305145, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0069, + "step": 7880 + }, + { + "epoch": 0.48289368994430504, + "grad_norm": 0.24891899526119232, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.48350572250443724, + "grad_norm": 0.44080299139022827, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.011, + "step": 7900 + }, + { + "epoch": 0.48411775506456944, + "grad_norm": 0.20801177620887756, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0089, + "step": 7910 + }, + { + "epoch": 0.48472978762470165, + "grad_norm": 0.31475305557250977, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0091, + "step": 7920 + }, + { + "epoch": 0.48534182018483385, + "grad_norm": 0.29783639311790466, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0082, + "step": 7930 + }, + { + "epoch": 0.48595385274496605, + "grad_norm": 0.3330203890800476, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0071, + "step": 7940 + }, + { + "epoch": 0.48656588530509826, + "grad_norm": 0.3537667691707611, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0068, + "step": 7950 + }, + { + "epoch": 0.4871779178652304, + "grad_norm": 0.2810688316822052, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0059, + "step": 7960 + }, + { + "epoch": 0.4877899504253626, + "grad_norm": 0.3359779715538025, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0059, + "step": 7970 + }, + { + "epoch": 0.4884019829854948, + "grad_norm": 0.36015257239341736, + "learning_rate": 1.754802282200567e-05, + "loss": 0.008, + "step": 7980 + }, + { + "epoch": 0.489014015545627, + "grad_norm": 0.2647690176963806, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0065, + "step": 7990 + }, + { + "epoch": 0.4896260481057592, + "grad_norm": 0.23366811871528625, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0068, + "step": 8000 + }, + { + "epoch": 0.4902380806658914, + "grad_norm": 0.2904139757156372, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0054, + "step": 8010 + }, + { + "epoch": 0.4908501132260236, + "grad_norm": 0.30941230058670044, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0069, + "step": 8020 + }, + { + "epoch": 0.4914621457861558, + "grad_norm": 0.1959473341703415, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0063, + "step": 8030 + }, + { + "epoch": 0.492074178346288, + "grad_norm": 0.33349713683128357, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0059, + "step": 8040 + }, + { + "epoch": 0.49268621090642023, + "grad_norm": 0.39017921686172485, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0067, + "step": 8050 + }, + { + "epoch": 0.49329824346655243, + "grad_norm": 0.36401957273483276, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0061, + "step": 8060 + }, + { + "epoch": 0.49391027602668464, + "grad_norm": 0.22296921908855438, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0071, + "step": 8070 + }, + { + "epoch": 0.49452230858681684, + "grad_norm": 0.8712129592895508, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0104, + "step": 8080 + }, + { + "epoch": 0.49513434114694904, + "grad_norm": 0.39942649006843567, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0068, + "step": 8090 + }, + { + "epoch": 0.4957463737070812, + "grad_norm": 0.3821292817592621, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0076, + "step": 8100 + }, + { + "epoch": 0.4963584062672134, + "grad_norm": 0.35861077904701233, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0086, + "step": 8110 + }, + { + "epoch": 0.4969704388273456, + "grad_norm": 0.38629451394081116, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0068, + "step": 8120 + }, + { + "epoch": 0.4975824713874778, + "grad_norm": 3.412374973297119, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0168, + "step": 8130 + }, + { + "epoch": 0.49819450394761, + "grad_norm": 0.2893833816051483, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0065, + "step": 8140 + }, + { + "epoch": 0.4988065365077422, + "grad_norm": 0.37679117918014526, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0058, + "step": 8150 + }, + { + "epoch": 0.4994185690678744, + "grad_norm": 0.2745130658149719, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0082, + "step": 8160 + }, + { + "epoch": 0.5000306016280066, + "grad_norm": 0.30250442028045654, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0065, + "step": 8170 + }, + { + "epoch": 0.5006426341881388, + "grad_norm": 0.19602464139461517, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0056, + "step": 8180 + }, + { + "epoch": 0.501254666748271, + "grad_norm": 0.4736115634441376, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0062, + "step": 8190 + }, + { + "epoch": 0.5018666993084032, + "grad_norm": 0.25439244508743286, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0069, + "step": 8200 + }, + { + "epoch": 0.5024787318685354, + "grad_norm": 0.19290995597839355, + "learning_rate": 1.739216409306913e-05, + "loss": 0.007, + "step": 8210 + }, + { + "epoch": 0.5030907644286676, + "grad_norm": 0.24844267964363098, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0071, + "step": 8220 + }, + { + "epoch": 0.5037027969887998, + "grad_norm": 0.21179668605327606, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0071, + "step": 8230 + }, + { + "epoch": 0.504314829548932, + "grad_norm": 0.29139387607574463, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.5049268621090642, + "grad_norm": 0.2621973752975464, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0068, + "step": 8250 + }, + { + "epoch": 0.5055388946691964, + "grad_norm": 0.23394125699996948, + "learning_rate": 1.735775329110705e-05, + "loss": 0.006, + "step": 8260 + }, + { + "epoch": 0.5061509272293286, + "grad_norm": 0.28399863839149475, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0067, + "step": 8270 + }, + { + "epoch": 0.5067629597894608, + "grad_norm": 0.5048072934150696, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.008, + "step": 8280 + }, + { + "epoch": 0.507374992349593, + "grad_norm": 0.33848801255226135, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0054, + "step": 8290 + }, + { + "epoch": 0.5079870249097252, + "grad_norm": 0.28341951966285706, + "learning_rate": 1.733009030001197e-05, + "loss": 0.008, + "step": 8300 + }, + { + "epoch": 0.5085990574698575, + "grad_norm": 0.3223153054714203, + "learning_rate": 1.732315596014244e-05, + "loss": 0.007, + "step": 8310 + }, + { + "epoch": 0.5092110900299895, + "grad_norm": 0.23227599263191223, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.5098231225901217, + "grad_norm": 0.2847786247730255, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.007, + "step": 8330 + }, + { + "epoch": 0.510435155150254, + "grad_norm": 0.2026357650756836, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 0.5110471877103862, + "grad_norm": 0.3617453873157501, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 0.5116592202705184, + "grad_norm": 0.4439109265804291, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0067, + "step": 8360 + }, + { + "epoch": 0.5122712528306506, + "grad_norm": 0.26640209555625916, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0086, + "step": 8370 + }, + { + "epoch": 0.5128832853907828, + "grad_norm": 0.38045984506607056, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.513495317950915, + "grad_norm": 0.23035791516304016, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.006, + "step": 8390 + }, + { + "epoch": 0.5141073505110472, + "grad_norm": 0.40618664026260376, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0071, + "step": 8400 + }, + { + "epoch": 0.5147193830711794, + "grad_norm": 0.2593354880809784, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0064, + "step": 8410 + }, + { + "epoch": 0.5153314156313116, + "grad_norm": 0.27723655104637146, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0062, + "step": 8420 + }, + { + "epoch": 0.5159434481914438, + "grad_norm": 0.3793911039829254, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0059, + "step": 8430 + }, + { + "epoch": 0.516555480751576, + "grad_norm": 0.28634312748908997, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0063, + "step": 8440 + }, + { + "epoch": 0.5171675133117082, + "grad_norm": 0.39417290687561035, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0078, + "step": 8450 + }, + { + "epoch": 0.5177795458718404, + "grad_norm": 0.3043057322502136, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0064, + "step": 8460 + }, + { + "epoch": 0.5183915784319726, + "grad_norm": 0.36794111132621765, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0106, + "step": 8470 + }, + { + "epoch": 0.5190036109921048, + "grad_norm": 0.312161922454834, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0067, + "step": 8480 + }, + { + "epoch": 0.519615643552237, + "grad_norm": 0.39240267872810364, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0065, + "step": 8490 + }, + { + "epoch": 0.5202276761123692, + "grad_norm": 0.4500446915626526, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0073, + "step": 8500 + }, + { + "epoch": 0.5208397086725014, + "grad_norm": 0.22808927297592163, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0072, + "step": 8510 + }, + { + "epoch": 0.5214517412326336, + "grad_norm": 0.3262411057949066, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0065, + "step": 8520 + }, + { + "epoch": 0.5220637737927658, + "grad_norm": 0.472229927778244, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0068, + "step": 8530 + }, + { + "epoch": 0.522675806352898, + "grad_norm": 0.31563568115234375, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5232878389130302, + "grad_norm": 0.27949750423431396, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0064, + "step": 8550 + }, + { + "epoch": 0.5238998714731624, + "grad_norm": 0.30297499895095825, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0075, + "step": 8560 + }, + { + "epoch": 0.5245119040332946, + "grad_norm": 0.3946770429611206, + "learning_rate": 1.714028248198457e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 0.5251239365934268, + "grad_norm": 0.3405992090702057, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0082, + "step": 8580 + }, + { + "epoch": 0.525735969153559, + "grad_norm": 0.2963511347770691, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0066, + "step": 8590 + }, + { + "epoch": 0.5263480017136911, + "grad_norm": 0.1909177303314209, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.006, + "step": 8600 + }, + { + "epoch": 0.5269600342738233, + "grad_norm": 0.3378836512565613, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 0.5275720668339555, + "grad_norm": 0.30862805247306824, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0067, + "step": 8620 + }, + { + "epoch": 0.5281840993940877, + "grad_norm": 0.397293359041214, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0071, + "step": 8630 + }, + { + "epoch": 0.5287961319542199, + "grad_norm": 0.3665411174297333, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0068, + "step": 8640 + }, + { + "epoch": 0.5294081645143521, + "grad_norm": 0.34842419624328613, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.5300201970744843, + "grad_norm": 0.38205671310424805, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0065, + "step": 8660 + }, + { + "epoch": 0.5306322296346165, + "grad_norm": 0.35549092292785645, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0068, + "step": 8670 + }, + { + "epoch": 0.5312442621947487, + "grad_norm": 0.15676020085811615, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0055, + "step": 8680 + }, + { + "epoch": 0.5318562947548809, + "grad_norm": 0.22985056042671204, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0071, + "step": 8690 + }, + { + "epoch": 0.5324683273150131, + "grad_norm": 0.2743426263332367, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0047, + "step": 8700 + }, + { + "epoch": 0.5330803598751453, + "grad_norm": 0.2503803074359894, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0079, + "step": 8710 + }, + { + "epoch": 0.5336923924352776, + "grad_norm": 0.5036469101905823, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5343044249954098, + "grad_norm": 0.2349964827299118, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0068, + "step": 8730 + }, + { + "epoch": 0.534916457555542, + "grad_norm": 0.28706061840057373, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0065, + "step": 8740 + }, + { + "epoch": 0.5355284901156742, + "grad_norm": 0.21812452375888824, + "learning_rate": 1.701081551967764e-05, + "loss": 0.008, + "step": 8750 + }, + { + "epoch": 0.5361405226758064, + "grad_norm": 0.301618754863739, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 0.5367525552359386, + "grad_norm": 0.35402950644493103, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0067, + "step": 8770 + }, + { + "epoch": 0.5373645877960708, + "grad_norm": 0.2875203788280487, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0097, + "step": 8780 + }, + { + "epoch": 0.537976620356203, + "grad_norm": 0.2358965128660202, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0053, + "step": 8790 + }, + { + "epoch": 0.5385886529163352, + "grad_norm": 0.14462094008922577, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0059, + "step": 8800 + }, + { + "epoch": 0.5392006854764674, + "grad_norm": 0.17893171310424805, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0062, + "step": 8810 + }, + { + "epoch": 0.5398127180365996, + "grad_norm": 0.2923351526260376, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0078, + "step": 8820 + }, + { + "epoch": 0.5404247505967318, + "grad_norm": 0.3288479745388031, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0067, + "step": 8830 + }, + { + "epoch": 0.541036783156864, + "grad_norm": 0.3996310532093048, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5416488157169962, + "grad_norm": 0.24345380067825317, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0067, + "step": 8850 + }, + { + "epoch": 0.5422608482771284, + "grad_norm": 0.26688340306282043, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0072, + "step": 8860 + }, + { + "epoch": 0.5428728808372606, + "grad_norm": 0.4816153645515442, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0099, + "step": 8870 + }, + { + "epoch": 0.5434849133973927, + "grad_norm": 0.22544988989830017, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.007, + "step": 8880 + }, + { + "epoch": 0.5440969459575249, + "grad_norm": 0.2820419669151306, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0064, + "step": 8890 + }, + { + "epoch": 0.5447089785176571, + "grad_norm": 0.2758846879005432, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0072, + "step": 8900 + }, + { + "epoch": 0.5453210110777893, + "grad_norm": 0.4620129466056824, + "learning_rate": 1.689381359053773e-05, + "loss": 0.008, + "step": 8910 + }, + { + "epoch": 0.5459330436379215, + "grad_norm": 0.5567039847373962, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0079, + "step": 8920 + }, + { + "epoch": 0.5465450761980537, + "grad_norm": 0.347251832485199, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.006, + "step": 8930 + }, + { + "epoch": 0.5471571087581859, + "grad_norm": 0.31768012046813965, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0065, + "step": 8940 + }, + { + "epoch": 0.5477691413183181, + "grad_norm": 0.24245156347751617, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0052, + "step": 8950 + }, + { + "epoch": 0.5483811738784503, + "grad_norm": 0.2124931961297989, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0074, + "step": 8960 + }, + { + "epoch": 0.5489932064385825, + "grad_norm": 0.18998636305332184, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0056, + "step": 8970 + }, + { + "epoch": 0.5496052389987147, + "grad_norm": 0.2667362689971924, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0056, + "step": 8980 + }, + { + "epoch": 0.5502172715588469, + "grad_norm": 0.4424617886543274, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0091, + "step": 8990 + }, + { + "epoch": 0.5508293041189791, + "grad_norm": 0.33623644709587097, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0061, + "step": 9000 + }, + { + "epoch": 0.5514413366791113, + "grad_norm": 0.29990604519844055, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0059, + "step": 9010 + }, + { + "epoch": 0.5520533692392435, + "grad_norm": 0.4384118914604187, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0059, + "step": 9020 + }, + { + "epoch": 0.5526654017993757, + "grad_norm": 0.3468496799468994, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0068, + "step": 9030 + }, + { + "epoch": 0.5532774343595079, + "grad_norm": 0.3473573327064514, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0062, + "step": 9040 + }, + { + "epoch": 0.5538894669196401, + "grad_norm": 0.36125242710113525, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0062, + "step": 9050 + }, + { + "epoch": 0.5545014994797723, + "grad_norm": 0.2603420615196228, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0091, + "step": 9060 + }, + { + "epoch": 0.5551135320399045, + "grad_norm": 0.27355659008026123, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0074, + "step": 9070 + }, + { + "epoch": 0.5557255646000367, + "grad_norm": 0.24741119146347046, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0064, + "step": 9080 + }, + { + "epoch": 0.556337597160169, + "grad_norm": 0.2001475840806961, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0094, + "step": 9090 + }, + { + "epoch": 0.5569496297203012, + "grad_norm": 0.41522347927093506, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0088, + "step": 9100 + }, + { + "epoch": 0.5575616622804334, + "grad_norm": 0.27282488346099854, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0062, + "step": 9110 + }, + { + "epoch": 0.5581736948405656, + "grad_norm": 0.26905956864356995, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.007, + "step": 9120 + }, + { + "epoch": 0.5587857274006978, + "grad_norm": 0.24747484922409058, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0084, + "step": 9130 + }, + { + "epoch": 0.55939775996083, + "grad_norm": 0.1863871067762375, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0061, + "step": 9140 + }, + { + "epoch": 0.5600097925209622, + "grad_norm": 0.3599740266799927, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0063, + "step": 9150 + }, + { + "epoch": 0.5606218250810943, + "grad_norm": 0.2238125205039978, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0047, + "step": 9160 + }, + { + "epoch": 0.5612338576412265, + "grad_norm": 0.272077351808548, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.006, + "step": 9170 + }, + { + "epoch": 0.5618458902013587, + "grad_norm": 0.2371625155210495, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0064, + "step": 9180 + }, + { + "epoch": 0.5624579227614909, + "grad_norm": 0.12783293426036835, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0054, + "step": 9190 + }, + { + "epoch": 0.5630699553216231, + "grad_norm": 0.3144581615924835, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0064, + "step": 9200 + }, + { + "epoch": 0.5636819878817553, + "grad_norm": 0.31995031237602234, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0082, + "step": 9210 + }, + { + "epoch": 0.5642940204418875, + "grad_norm": 0.31995660066604614, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0083, + "step": 9220 + }, + { + "epoch": 0.5649060530020197, + "grad_norm": 0.5018982291221619, + "learning_rate": 1.665453350687773e-05, + "loss": 0.007, + "step": 9230 + }, + { + "epoch": 0.5655180855621519, + "grad_norm": 0.2927841544151306, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0104, + "step": 9240 + }, + { + "epoch": 0.5661301181222841, + "grad_norm": 0.21124979853630066, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0068, + "step": 9250 + }, + { + "epoch": 0.5667421506824163, + "grad_norm": 0.25787463784217834, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0059, + "step": 9260 + }, + { + "epoch": 0.5673541832425485, + "grad_norm": 0.3194720447063446, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0065, + "step": 9270 + }, + { + "epoch": 0.5679662158026807, + "grad_norm": 0.24165599048137665, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.006, + "step": 9280 + }, + { + "epoch": 0.5685782483628129, + "grad_norm": 0.4880482256412506, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0096, + "step": 9290 + }, + { + "epoch": 0.5691902809229451, + "grad_norm": 0.24660199880599976, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0098, + "step": 9300 + }, + { + "epoch": 0.5698023134830773, + "grad_norm": 0.24707400798797607, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0052, + "step": 9310 + }, + { + "epoch": 0.5704143460432095, + "grad_norm": 0.33855682611465454, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.011, + "step": 9320 + }, + { + "epoch": 0.5710263786033417, + "grad_norm": 0.22913751006126404, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0074, + "step": 9330 + }, + { + "epoch": 0.5716384111634739, + "grad_norm": 0.24127185344696045, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0062, + "step": 9340 + }, + { + "epoch": 0.5722504437236061, + "grad_norm": 0.26104915142059326, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0065, + "step": 9350 + }, + { + "epoch": 0.5728624762837383, + "grad_norm": 0.21698857843875885, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0062, + "step": 9360 + }, + { + "epoch": 0.5734745088438705, + "grad_norm": 0.29092445969581604, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0081, + "step": 9370 + }, + { + "epoch": 0.5740865414040027, + "grad_norm": 0.2534378468990326, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0064, + "step": 9380 + }, + { + "epoch": 0.5746985739641349, + "grad_norm": 0.28900131583213806, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.5753106065242671, + "grad_norm": 0.3028101921081543, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0104, + "step": 9400 + }, + { + "epoch": 0.5759226390843993, + "grad_norm": 0.28851139545440674, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 0.5765346716445315, + "grad_norm": 0.5735841393470764, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0072, + "step": 9420 + }, + { + "epoch": 0.5771467042046637, + "grad_norm": 0.20355567336082458, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0056, + "step": 9430 + }, + { + "epoch": 0.5777587367647958, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.006, + "step": 9440 + }, + { + "epoch": 0.578370769324928, + "grad_norm": 0.2701684832572937, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 0.5789828018850602, + "grad_norm": 0.17381855845451355, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0054, + "step": 9460 + }, + { + "epoch": 0.5795948344451924, + "grad_norm": 0.250261515378952, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5802068670053246, + "grad_norm": 0.22972841560840607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0058, + "step": 9480 + }, + { + "epoch": 0.5808188995654568, + "grad_norm": 0.22654809057712555, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0061, + "step": 9490 + }, + { + "epoch": 0.581430932125589, + "grad_norm": 0.17165100574493408, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5820429646857213, + "grad_norm": 0.2462143450975418, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0054, + "step": 9510 + }, + { + "epoch": 0.5826549972458535, + "grad_norm": 0.3970383107662201, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0069, + "step": 9520 + }, + { + "epoch": 0.5832670298059857, + "grad_norm": 0.21578988432884216, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0054, + "step": 9530 + }, + { + "epoch": 0.5838790623661179, + "grad_norm": 0.5680915713310242, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0072, + "step": 9540 + }, + { + "epoch": 0.5844910949262501, + "grad_norm": 0.24070246517658234, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0073, + "step": 9550 + }, + { + "epoch": 0.5851031274863823, + "grad_norm": 0.2524685263633728, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0068, + "step": 9560 + }, + { + "epoch": 0.5857151600465145, + "grad_norm": 0.27286672592163086, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.008, + "step": 9570 + }, + { + "epoch": 0.5863271926066467, + "grad_norm": 0.3459629714488983, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0088, + "step": 9580 + }, + { + "epoch": 0.5869392251667789, + "grad_norm": 0.2964814603328705, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0066, + "step": 9590 + }, + { + "epoch": 0.5875512577269111, + "grad_norm": 0.3559853434562683, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0064, + "step": 9600 + }, + { + "epoch": 0.5881632902870433, + "grad_norm": 0.256898432970047, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0056, + "step": 9610 + }, + { + "epoch": 0.5887753228471755, + "grad_norm": 0.25032711029052734, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0052, + "step": 9620 + }, + { + "epoch": 0.5893873554073077, + "grad_norm": 0.2467224895954132, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0048, + "step": 9630 + }, + { + "epoch": 0.5899993879674399, + "grad_norm": 0.5331161618232727, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0078, + "step": 9640 + }, + { + "epoch": 0.5906114205275721, + "grad_norm": 0.33348897099494934, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0068, + "step": 9650 + }, + { + "epoch": 0.5912234530877043, + "grad_norm": 0.21435993909835815, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0058, + "step": 9660 + }, + { + "epoch": 0.5918354856478365, + "grad_norm": 0.35850396752357483, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0068, + "step": 9670 + }, + { + "epoch": 0.5924475182079687, + "grad_norm": 0.3007623851299286, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0063, + "step": 9680 + }, + { + "epoch": 0.5930595507681009, + "grad_norm": 0.22949714958667755, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0054, + "step": 9690 + }, + { + "epoch": 0.5936715833282331, + "grad_norm": 0.23259367048740387, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.5942836158883653, + "grad_norm": 0.2305079996585846, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0047, + "step": 9710 + }, + { + "epoch": 0.5948956484484974, + "grad_norm": 0.33875930309295654, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0063, + "step": 9720 + }, + { + "epoch": 0.5955076810086296, + "grad_norm": 0.3981896936893463, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0076, + "step": 9730 + }, + { + "epoch": 0.5961197135687618, + "grad_norm": 0.280831515789032, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0075, + "step": 9740 + }, + { + "epoch": 0.596731746128894, + "grad_norm": 0.26045629382133484, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0064, + "step": 9750 + }, + { + "epoch": 0.5973437786890262, + "grad_norm": 0.23102521896362305, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0071, + "step": 9760 + }, + { + "epoch": 0.5979558112491584, + "grad_norm": 0.5013224482536316, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0063, + "step": 9770 + }, + { + "epoch": 0.5985678438092906, + "grad_norm": 0.45689067244529724, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0088, + "step": 9780 + }, + { + "epoch": 0.5991798763694228, + "grad_norm": 0.27118632197380066, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0065, + "step": 9790 + }, + { + "epoch": 0.599791908929555, + "grad_norm": 0.420202374458313, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0076, + "step": 9800 + }, + { + "epoch": 0.6004039414896872, + "grad_norm": 0.35844025015830994, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0056, + "step": 9810 + }, + { + "epoch": 0.6010159740498194, + "grad_norm": 0.2205585241317749, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0082, + "step": 9820 + }, + { + "epoch": 0.6016280066099516, + "grad_norm": 0.18860426545143127, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.007, + "step": 9830 + }, + { + "epoch": 0.6022400391700838, + "grad_norm": 0.25045180320739746, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0082, + "step": 9840 + }, + { + "epoch": 0.602852071730216, + "grad_norm": 0.2581705152988434, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0065, + "step": 9850 + }, + { + "epoch": 0.6034641042903482, + "grad_norm": 0.25894811749458313, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0058, + "step": 9860 + }, + { + "epoch": 0.6040761368504804, + "grad_norm": 0.43305444717407227, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0066, + "step": 9870 + }, + { + "epoch": 0.6046881694106127, + "grad_norm": 0.2295757383108139, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0069, + "step": 9880 + }, + { + "epoch": 0.6053002019707449, + "grad_norm": 0.29785802960395813, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0074, + "step": 9890 + }, + { + "epoch": 0.6059122345308771, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0068, + "step": 9900 + }, + { + "epoch": 0.6065242670910093, + "grad_norm": 0.29115045070648193, + "learning_rate": 1.612387195896372e-05, + "loss": 0.008, + "step": 9910 + }, + { + "epoch": 0.6071362996511415, + "grad_norm": 0.3202555477619171, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0071, + "step": 9920 + }, + { + "epoch": 0.6077483322112737, + "grad_norm": 0.2849314212799072, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.005, + "step": 9930 + }, + { + "epoch": 0.6083603647714059, + "grad_norm": 0.2768756151199341, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0051, + "step": 9940 + }, + { + "epoch": 0.6089723973315381, + "grad_norm": 0.3138035535812378, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0058, + "step": 9950 + }, + { + "epoch": 0.6095844298916703, + "grad_norm": 0.20827682316303253, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0058, + "step": 9960 + }, + { + "epoch": 0.6101964624518025, + "grad_norm": 0.29986995458602905, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0076, + "step": 9970 + }, + { + "epoch": 0.6108084950119347, + "grad_norm": 0.23564326763153076, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0056, + "step": 9980 + }, + { + "epoch": 0.6114205275720669, + "grad_norm": 0.24854765832424164, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0066, + "step": 9990 + }, + { + "epoch": 0.6120325601321991, + "grad_norm": 0.5696694850921631, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0072, + "step": 10000 + }, + { + "epoch": 0.6126445926923312, + "grad_norm": 0.24267911911010742, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0061, + "step": 10010 + }, + { + "epoch": 0.6132566252524634, + "grad_norm": 0.1955283135175705, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0076, + "step": 10020 + }, + { + "epoch": 0.6138686578125956, + "grad_norm": 0.3427830934524536, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0073, + "step": 10030 + }, + { + "epoch": 0.6144806903727278, + "grad_norm": 0.38532915711402893, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0078, + "step": 10040 + }, + { + "epoch": 0.61509272293286, + "grad_norm": 0.4302294850349426, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0071, + "step": 10050 + }, + { + "epoch": 0.6157047554929922, + "grad_norm": 0.38420233130455017, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0072, + "step": 10060 + }, + { + "epoch": 0.6163167880531244, + "grad_norm": 0.23822636902332306, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.004, + "step": 10070 + }, + { + "epoch": 0.6169288206132566, + "grad_norm": 0.25123289227485657, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0065, + "step": 10080 + }, + { + "epoch": 0.6175408531733888, + "grad_norm": 0.23007746040821075, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0059, + "step": 10090 + }, + { + "epoch": 0.618152885733521, + "grad_norm": 0.24051082134246826, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0089, + "step": 10100 + }, + { + "epoch": 0.6187649182936532, + "grad_norm": 0.26246321201324463, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0052, + "step": 10110 + }, + { + "epoch": 0.6193769508537854, + "grad_norm": 0.3160432279109955, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0059, + "step": 10120 + }, + { + "epoch": 0.6199889834139176, + "grad_norm": 0.42534199357032776, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0071, + "step": 10130 + }, + { + "epoch": 0.6206010159740498, + "grad_norm": 0.22966268658638, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0052, + "step": 10140 + }, + { + "epoch": 0.621213048534182, + "grad_norm": 0.22234882414340973, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0073, + "step": 10150 + }, + { + "epoch": 0.6218250810943142, + "grad_norm": 0.31061676144599915, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0066, + "step": 10160 + }, + { + "epoch": 0.6224371136544464, + "grad_norm": 0.34178492426872253, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0063, + "step": 10170 + }, + { + "epoch": 0.6230491462145786, + "grad_norm": 0.263583779335022, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0079, + "step": 10180 + }, + { + "epoch": 0.6236611787747108, + "grad_norm": 0.3774336278438568, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0066, + "step": 10190 + }, + { + "epoch": 0.624273211334843, + "grad_norm": 0.29274430871009827, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.007, + "step": 10200 + }, + { + "epoch": 0.6248852438949752, + "grad_norm": 0.31850868463516235, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0056, + "step": 10210 + }, + { + "epoch": 0.6254972764551074, + "grad_norm": 0.3084369897842407, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0086, + "step": 10220 + }, + { + "epoch": 0.6261093090152396, + "grad_norm": 0.21596118807792664, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0072, + "step": 10230 + }, + { + "epoch": 0.6267213415753718, + "grad_norm": 0.16397996246814728, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0075, + "step": 10240 + }, + { + "epoch": 0.627333374135504, + "grad_norm": 0.15055827796459198, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0046, + "step": 10250 + }, + { + "epoch": 0.6279454066956363, + "grad_norm": 0.23483684659004211, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0064, + "step": 10260 + }, + { + "epoch": 0.6285574392557685, + "grad_norm": 0.3131091594696045, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0069, + "step": 10270 + }, + { + "epoch": 0.6291694718159007, + "grad_norm": 0.27958226203918457, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0067, + "step": 10280 + }, + { + "epoch": 0.6297815043760328, + "grad_norm": 0.23422567546367645, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0086, + "step": 10290 + }, + { + "epoch": 0.630393536936165, + "grad_norm": 0.4644703269004822, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0067, + "step": 10300 + }, + { + "epoch": 0.6310055694962972, + "grad_norm": 0.45787107944488525, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0068, + "step": 10310 + }, + { + "epoch": 0.6316176020564294, + "grad_norm": 0.21038737893104553, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0074, + "step": 10320 + }, + { + "epoch": 0.6322296346165616, + "grad_norm": 0.23812010884284973, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0054, + "step": 10330 + }, + { + "epoch": 0.6328416671766938, + "grad_norm": 0.36856284737586975, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0061, + "step": 10340 + }, + { + "epoch": 0.633453699736826, + "grad_norm": 0.3540131151676178, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0074, + "step": 10350 + }, + { + "epoch": 0.6340657322969582, + "grad_norm": 0.3004823923110962, + "learning_rate": 1.575723252169281e-05, + "loss": 0.006, + "step": 10360 + }, + { + "epoch": 0.6346777648570904, + "grad_norm": 0.17188489437103271, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0053, + "step": 10370 + }, + { + "epoch": 0.6352897974172226, + "grad_norm": 0.21710847318172455, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0062, + "step": 10380 + }, + { + "epoch": 0.6359018299773548, + "grad_norm": 0.2356785386800766, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0061, + "step": 10390 + }, + { + "epoch": 0.636513862537487, + "grad_norm": 0.2736414670944214, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0063, + "step": 10400 + }, + { + "epoch": 0.6371258950976192, + "grad_norm": 0.23872444033622742, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.006, + "step": 10410 + }, + { + "epoch": 0.6377379276577514, + "grad_norm": 0.24478361010551453, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0073, + "step": 10420 + }, + { + "epoch": 0.6383499602178836, + "grad_norm": 0.2964334487915039, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0048, + "step": 10430 + }, + { + "epoch": 0.6389619927780158, + "grad_norm": 0.2760549783706665, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0051, + "step": 10440 + }, + { + "epoch": 0.639574025338148, + "grad_norm": 0.2598065137863159, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0072, + "step": 10450 + }, + { + "epoch": 0.6401860578982802, + "grad_norm": 0.346999853849411, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0052, + "step": 10460 + }, + { + "epoch": 0.6407980904584124, + "grad_norm": 0.31291016936302185, + "learning_rate": 1.56658563993822e-05, + "loss": 0.007, + "step": 10470 + }, + { + "epoch": 0.6414101230185446, + "grad_norm": 0.2631952166557312, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6420221555786768, + "grad_norm": 0.30895209312438965, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.006, + "step": 10490 + }, + { + "epoch": 0.642634188138809, + "grad_norm": 0.17614217102527618, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0069, + "step": 10500 + }, + { + "epoch": 0.6432462206989412, + "grad_norm": 0.38792312145233154, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0077, + "step": 10510 + }, + { + "epoch": 0.6438582532590734, + "grad_norm": 0.1722564697265625, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0054, + "step": 10520 + }, + { + "epoch": 0.6444702858192056, + "grad_norm": 0.2741699516773224, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0062, + "step": 10530 + }, + { + "epoch": 0.6450823183793378, + "grad_norm": 0.2059863954782486, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0073, + "step": 10540 + }, + { + "epoch": 0.64569435093947, + "grad_norm": 0.2702447474002838, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.6463063834996022, + "grad_norm": 0.2299312800168991, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0051, + "step": 10560 + }, + { + "epoch": 0.6469184160597343, + "grad_norm": 0.1995723992586136, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0057, + "step": 10570 + }, + { + "epoch": 0.6475304486198665, + "grad_norm": 0.30346980690956116, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0063, + "step": 10580 + }, + { + "epoch": 0.6481424811799987, + "grad_norm": 0.5040738582611084, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0064, + "step": 10590 + }, + { + "epoch": 0.6487545137401309, + "grad_norm": 0.16984818875789642, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0071, + "step": 10600 + }, + { + "epoch": 0.6493665463002631, + "grad_norm": 0.26560020446777344, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 0.6499785788603953, + "grad_norm": 0.4563823342323303, + "learning_rate": 1.554018740860716e-05, + "loss": 0.008, + "step": 10620 + }, + { + "epoch": 0.6505906114205275, + "grad_norm": 0.23272818326950073, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.006, + "step": 10630 + }, + { + "epoch": 0.6512026439806597, + "grad_norm": 0.19166870415210724, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0066, + "step": 10640 + }, + { + "epoch": 0.651814676540792, + "grad_norm": 0.2822705805301666, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0062, + "step": 10650 + }, + { + "epoch": 0.6524267091009242, + "grad_norm": 0.24001267552375793, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0069, + "step": 10660 + }, + { + "epoch": 0.6530387416610564, + "grad_norm": 0.2563900947570801, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0068, + "step": 10670 + }, + { + "epoch": 0.6536507742211886, + "grad_norm": 0.2747437357902527, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0058, + "step": 10680 + }, + { + "epoch": 0.6542628067813208, + "grad_norm": 0.39710354804992676, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.005, + "step": 10690 + }, + { + "epoch": 0.654874839341453, + "grad_norm": 0.30690231919288635, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0102, + "step": 10700 + }, + { + "epoch": 0.6554868719015852, + "grad_norm": 0.2879253923892975, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0072, + "step": 10710 + }, + { + "epoch": 0.6560989044617174, + "grad_norm": 0.19964110851287842, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0065, + "step": 10720 + }, + { + "epoch": 0.6567109370218496, + "grad_norm": 0.20109151303768158, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6573229695819818, + "grad_norm": 0.21469832956790924, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0048, + "step": 10740 + }, + { + "epoch": 0.657935002142114, + "grad_norm": 0.19622936844825745, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0057, + "step": 10750 + }, + { + "epoch": 0.6585470347022462, + "grad_norm": 0.2255190759897232, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0052, + "step": 10760 + }, + { + "epoch": 0.6591590672623784, + "grad_norm": 0.47484955191612244, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0066, + "step": 10770 + }, + { + "epoch": 0.6597710998225106, + "grad_norm": 0.32192179560661316, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0067, + "step": 10780 + }, + { + "epoch": 0.6603831323826428, + "grad_norm": 0.33044904470443726, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0061, + "step": 10790 + }, + { + "epoch": 0.660995164942775, + "grad_norm": 0.3206661343574524, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0072, + "step": 10800 + }, + { + "epoch": 0.6616071975029072, + "grad_norm": 0.34903818368911743, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0055, + "step": 10810 + }, + { + "epoch": 0.6622192300630394, + "grad_norm": 0.1982222944498062, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 0.6628312626231716, + "grad_norm": 0.25388309359550476, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "epoch": 0.6634432951833038, + "grad_norm": 0.2325269728899002, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0078, + "step": 10840 + }, + { + "epoch": 0.6640553277434359, + "grad_norm": 0.3364964425563812, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0054, + "step": 10850 + }, + { + "epoch": 0.6646673603035681, + "grad_norm": 0.198661208152771, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0061, + "step": 10860 + }, + { + "epoch": 0.6652793928637003, + "grad_norm": 0.333836168050766, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0058, + "step": 10870 + }, + { + "epoch": 0.6658914254238325, + "grad_norm": 0.21908101439476013, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0087, + "step": 10880 + }, + { + "epoch": 0.6665034579839647, + "grad_norm": 0.3094167709350586, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0062, + "step": 10890 + }, + { + "epoch": 0.6671154905440969, + "grad_norm": 0.28113746643066406, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0066, + "step": 10900 + }, + { + "epoch": 0.6677275231042291, + "grad_norm": 0.20239399373531342, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0071, + "step": 10910 + }, + { + "epoch": 0.6683395556643613, + "grad_norm": 0.32829156517982483, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0056, + "step": 10920 + }, + { + "epoch": 0.6689515882244935, + "grad_norm": 0.2950859069824219, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0086, + "step": 10930 + }, + { + "epoch": 0.6695636207846257, + "grad_norm": 0.36404141783714294, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0075, + "step": 10940 + }, + { + "epoch": 0.6701756533447579, + "grad_norm": 0.2479381114244461, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0055, + "step": 10950 + }, + { + "epoch": 0.6707876859048901, + "grad_norm": 0.1934390366077423, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.005, + "step": 10960 + }, + { + "epoch": 0.6713997184650223, + "grad_norm": 0.20912423729896545, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0064, + "step": 10970 + }, + { + "epoch": 0.6720117510251545, + "grad_norm": 0.1781405806541443, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0048, + "step": 10980 + }, + { + "epoch": 0.6726237835852867, + "grad_norm": 0.18812811374664307, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0048, + "step": 10990 + }, + { + "epoch": 0.6732358161454189, + "grad_norm": 0.2006077766418457, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0073, + "step": 11000 + }, + { + "epoch": 0.6738478487055511, + "grad_norm": 0.20471568405628204, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0068, + "step": 11010 + }, + { + "epoch": 0.6744598812656833, + "grad_norm": 0.2979716658592224, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0067, + "step": 11020 + }, + { + "epoch": 0.6750719138258156, + "grad_norm": 0.3256290853023529, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0054, + "step": 11030 + }, + { + "epoch": 0.6756839463859478, + "grad_norm": 0.3346560001373291, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0061, + "step": 11040 + }, + { + "epoch": 0.67629597894608, + "grad_norm": 0.35791122913360596, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0054, + "step": 11050 + }, + { + "epoch": 0.6769080115062122, + "grad_norm": 0.30428826808929443, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0066, + "step": 11060 + }, + { + "epoch": 0.6775200440663444, + "grad_norm": 0.31254154443740845, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0065, + "step": 11070 + }, + { + "epoch": 0.6781320766264766, + "grad_norm": 0.263028621673584, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0062, + "step": 11080 + }, + { + "epoch": 0.6787441091866088, + "grad_norm": 0.22496990859508514, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.679356141746741, + "grad_norm": 0.2647632360458374, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0072, + "step": 11100 + }, + { + "epoch": 0.6799681743068732, + "grad_norm": 0.2517150342464447, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0064, + "step": 11110 + }, + { + "epoch": 0.6805802068670054, + "grad_norm": 0.30550616979599, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0055, + "step": 11120 + }, + { + "epoch": 0.6811922394271375, + "grad_norm": 0.21312931180000305, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0074, + "step": 11130 + }, + { + "epoch": 0.6818042719872697, + "grad_norm": 0.21152199804782867, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0047, + "step": 11140 + }, + { + "epoch": 0.6824163045474019, + "grad_norm": 0.2030613273382187, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0045, + "step": 11150 + }, + { + "epoch": 0.6830283371075341, + "grad_norm": 0.30646151304244995, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0045, + "step": 11160 + }, + { + "epoch": 0.6836403696676663, + "grad_norm": 0.2693783938884735, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0061, + "step": 11170 + }, + { + "epoch": 0.6842524022277985, + "grad_norm": 0.25288495421409607, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0068, + "step": 11180 + }, + { + "epoch": 0.6848644347879307, + "grad_norm": 0.34989964962005615, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.007, + "step": 11190 + }, + { + "epoch": 0.6854764673480629, + "grad_norm": 0.192350834608078, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0064, + "step": 11200 + }, + { + "epoch": 0.6860884999081951, + "grad_norm": 0.3841196894645691, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0069, + "step": 11210 + }, + { + "epoch": 0.6867005324683273, + "grad_norm": 0.2168666571378708, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0063, + "step": 11220 + }, + { + "epoch": 0.6873125650284595, + "grad_norm": 0.2756234109401703, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0068, + "step": 11230 + }, + { + "epoch": 0.6879245975885917, + "grad_norm": 0.1971903294324875, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.006, + "step": 11240 + }, + { + "epoch": 0.6885366301487239, + "grad_norm": 0.3857499659061432, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0063, + "step": 11250 + }, + { + "epoch": 0.6891486627088561, + "grad_norm": 0.194110706448555, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0049, + "step": 11260 + }, + { + "epoch": 0.6897606952689883, + "grad_norm": 0.24935179948806763, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0054, + "step": 11270 + }, + { + "epoch": 0.6903727278291205, + "grad_norm": 0.5208527445793152, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0062, + "step": 11280 + }, + { + "epoch": 0.6909847603892527, + "grad_norm": 0.2917899191379547, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0058, + "step": 11290 + }, + { + "epoch": 0.6915967929493849, + "grad_norm": 0.42692577838897705, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0072, + "step": 11300 + }, + { + "epoch": 0.6922088255095171, + "grad_norm": 0.36888429522514343, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0059, + "step": 11310 + }, + { + "epoch": 0.6928208580696493, + "grad_norm": 0.26246029138565063, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0057, + "step": 11320 + }, + { + "epoch": 0.6934328906297815, + "grad_norm": 0.22163739800453186, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0078, + "step": 11330 + }, + { + "epoch": 0.6940449231899137, + "grad_norm": 0.33411458134651184, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0053, + "step": 11340 + }, + { + "epoch": 0.6946569557500459, + "grad_norm": 0.2792898118495941, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0095, + "step": 11350 + }, + { + "epoch": 0.6952689883101781, + "grad_norm": 0.2770175039768219, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0059, + "step": 11360 + }, + { + "epoch": 0.6958810208703103, + "grad_norm": 0.14913171529769897, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0046, + "step": 11370 + }, + { + "epoch": 0.6964930534304425, + "grad_norm": 0.22906239330768585, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0054, + "step": 11380 + }, + { + "epoch": 0.6971050859905747, + "grad_norm": 0.2854336202144623, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0052, + "step": 11390 + }, + { + "epoch": 0.697717118550707, + "grad_norm": 0.21835818886756897, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0064, + "step": 11400 + }, + { + "epoch": 0.698329151110839, + "grad_norm": 0.42180293798446655, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0071, + "step": 11410 + }, + { + "epoch": 0.6989411836709712, + "grad_norm": 0.3056841492652893, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0092, + "step": 11420 + }, + { + "epoch": 0.6995532162311034, + "grad_norm": 0.15149559080600739, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0049, + "step": 11430 + }, + { + "epoch": 0.7001652487912357, + "grad_norm": 0.15561188757419586, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0051, + "step": 11440 + }, + { + "epoch": 0.7007772813513679, + "grad_norm": 0.2941122055053711, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0065, + "step": 11450 + }, + { + "epoch": 0.7013893139115001, + "grad_norm": 0.3008195757865906, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0059, + "step": 11460 + }, + { + "epoch": 0.7020013464716323, + "grad_norm": 0.3787235617637634, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0068, + "step": 11470 + }, + { + "epoch": 0.7026133790317645, + "grad_norm": 0.2069675624370575, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.005, + "step": 11480 + }, + { + "epoch": 0.7032254115918967, + "grad_norm": 0.33505553007125854, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0058, + "step": 11490 + }, + { + "epoch": 0.7038374441520289, + "grad_norm": 0.281213641166687, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0064, + "step": 11500 + }, + { + "epoch": 0.7044494767121611, + "grad_norm": 0.28471192717552185, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0066, + "step": 11510 + }, + { + "epoch": 0.7050615092722933, + "grad_norm": 0.3166801929473877, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0062, + "step": 11520 + }, + { + "epoch": 0.7056735418324255, + "grad_norm": 0.26893407106399536, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.005, + "step": 11530 + }, + { + "epoch": 0.7062855743925577, + "grad_norm": 0.17421478033065796, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0063, + "step": 11540 + }, + { + "epoch": 0.7068976069526899, + "grad_norm": 0.40999990701675415, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 0.7075096395128221, + "grad_norm": 0.190180242061615, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0061, + "step": 11560 + }, + { + "epoch": 0.7081216720729543, + "grad_norm": 0.20383603870868683, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0049, + "step": 11570 + }, + { + "epoch": 0.7087337046330865, + "grad_norm": 0.28741395473480225, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0059, + "step": 11580 + }, + { + "epoch": 0.7093457371932187, + "grad_norm": 0.24231962859630585, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.008, + "step": 11590 + }, + { + "epoch": 0.7099577697533509, + "grad_norm": 0.2221115529537201, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0053, + "step": 11600 + }, + { + "epoch": 0.7105698023134831, + "grad_norm": 0.18564820289611816, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0072, + "step": 11610 + }, + { + "epoch": 0.7111818348736153, + "grad_norm": 0.3734343647956848, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0089, + "step": 11620 + }, + { + "epoch": 0.7117938674337475, + "grad_norm": 0.3215912878513336, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0093, + "step": 11630 + }, + { + "epoch": 0.7124058999938797, + "grad_norm": 0.22602899372577667, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0062, + "step": 11640 + }, + { + "epoch": 0.7130179325540119, + "grad_norm": 0.3115978538990021, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.006, + "step": 11650 + }, + { + "epoch": 0.7136299651141441, + "grad_norm": 0.26148155331611633, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0071, + "step": 11660 + }, + { + "epoch": 0.7142419976742763, + "grad_norm": 0.142781600356102, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0051, + "step": 11670 + }, + { + "epoch": 0.7148540302344085, + "grad_norm": 0.21306048333644867, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0068, + "step": 11680 + }, + { + "epoch": 0.7154660627945407, + "grad_norm": 0.3439876437187195, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7160780953546728, + "grad_norm": 0.4010280966758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0062, + "step": 11700 + }, + { + "epoch": 0.716690127914805, + "grad_norm": 0.2760031819343567, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 0.7173021604749372, + "grad_norm": 0.45097261667251587, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0062, + "step": 11720 + }, + { + "epoch": 0.7179141930350694, + "grad_norm": 0.20118115842342377, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0054, + "step": 11730 + }, + { + "epoch": 0.7185262255952016, + "grad_norm": 0.3090760409832001, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0054, + "step": 11740 + }, + { + "epoch": 0.7191382581553338, + "grad_norm": 0.25016647577285767, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0077, + "step": 11750 + }, + { + "epoch": 0.719750290715466, + "grad_norm": 0.2310703545808792, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0053, + "step": 11760 + }, + { + "epoch": 0.7203623232755982, + "grad_norm": 0.2269359678030014, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.006, + "step": 11770 + }, + { + "epoch": 0.7209743558357304, + "grad_norm": 0.3917788565158844, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7215863883958626, + "grad_norm": 0.25999465584754944, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0064, + "step": 11790 + }, + { + "epoch": 0.7221984209559948, + "grad_norm": 0.19340357184410095, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0071, + "step": 11800 + }, + { + "epoch": 0.722810453516127, + "grad_norm": 0.25046268105506897, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0078, + "step": 11810 + }, + { + "epoch": 0.7234224860762593, + "grad_norm": 0.19819264113903046, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.005, + "step": 11820 + }, + { + "epoch": 0.7240345186363915, + "grad_norm": 0.43484950065612793, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0078, + "step": 11830 + }, + { + "epoch": 0.7246465511965237, + "grad_norm": 0.29191601276397705, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7252585837566559, + "grad_norm": 0.21717441082000732, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0056, + "step": 11850 + }, + { + "epoch": 0.7258706163167881, + "grad_norm": 0.3210129737854004, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0072, + "step": 11860 + }, + { + "epoch": 0.7264826488769203, + "grad_norm": 0.33192649483680725, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0061, + "step": 11870 + }, + { + "epoch": 0.7270946814370525, + "grad_norm": 0.14648163318634033, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7277067139971847, + "grad_norm": 0.20028764009475708, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0052, + "step": 11890 + }, + { + "epoch": 0.7283187465573169, + "grad_norm": 0.21449612081050873, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0063, + "step": 11900 + }, + { + "epoch": 0.7289307791174491, + "grad_norm": 0.27472081780433655, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0062, + "step": 11910 + }, + { + "epoch": 0.7295428116775813, + "grad_norm": 0.2919130027294159, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0048, + "step": 11920 + }, + { + "epoch": 0.7301548442377135, + "grad_norm": 0.153092160820961, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0053, + "step": 11930 + }, + { + "epoch": 0.7307668767978457, + "grad_norm": 0.22820086777210236, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0058, + "step": 11940 + }, + { + "epoch": 0.7313789093579779, + "grad_norm": 0.24281881749629974, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0044, + "step": 11950 + }, + { + "epoch": 0.7319909419181101, + "grad_norm": 0.32581812143325806, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0051, + "step": 11960 + }, + { + "epoch": 0.7326029744782423, + "grad_norm": 0.3139822483062744, + "learning_rate": 1.435930222050582e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 0.7332150070383744, + "grad_norm": 0.37985655665397644, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0052, + "step": 11980 + }, + { + "epoch": 0.7338270395985066, + "grad_norm": 0.1958508938550949, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.007, + "step": 11990 + }, + { + "epoch": 0.7344390721586388, + "grad_norm": 0.25318172574043274, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0084, + "step": 12000 + }, + { + "epoch": 0.735051104718771, + "grad_norm": 0.33245304226875305, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0051, + "step": 12010 + }, + { + "epoch": 0.7356631372789032, + "grad_norm": 0.2750372290611267, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0057, + "step": 12020 + }, + { + "epoch": 0.7362751698390354, + "grad_norm": 0.2057010382413864, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0057, + "step": 12030 + }, + { + "epoch": 0.7368872023991676, + "grad_norm": 0.30713731050491333, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0067, + "step": 12040 + }, + { + "epoch": 0.7374992349592998, + "grad_norm": 0.20423808693885803, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.006, + "step": 12050 + }, + { + "epoch": 0.738111267519432, + "grad_norm": 0.3129539489746094, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0067, + "step": 12060 + }, + { + "epoch": 0.7387233000795642, + "grad_norm": 0.25026270747184753, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7393353326396964, + "grad_norm": 0.4147534668445587, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0061, + "step": 12080 + }, + { + "epoch": 0.7399473651998286, + "grad_norm": 0.20954278111457825, + "learning_rate": 1.425047976058418e-05, + "loss": 0.006, + "step": 12090 + }, + { + "epoch": 0.7405593977599608, + "grad_norm": 0.2700798809528351, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0065, + "step": 12100 + }, + { + "epoch": 0.741171430320093, + "grad_norm": 0.2597086429595947, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0059, + "step": 12110 + }, + { + "epoch": 0.7417834628802252, + "grad_norm": 0.2674495279788971, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0045, + "step": 12120 + }, + { + "epoch": 0.7423954954403574, + "grad_norm": 0.24583879113197327, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0061, + "step": 12130 + }, + { + "epoch": 0.7430075280004896, + "grad_norm": 0.23704801499843597, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0054, + "step": 12140 + }, + { + "epoch": 0.7436195605606218, + "grad_norm": 0.2381024807691574, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0073, + "step": 12150 + }, + { + "epoch": 0.744231593120754, + "grad_norm": 0.24937355518341064, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0049, + "step": 12160 + }, + { + "epoch": 0.7448436256808862, + "grad_norm": 0.20442882180213928, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0061, + "step": 12170 + }, + { + "epoch": 0.7454556582410184, + "grad_norm": 0.3053426742553711, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0087, + "step": 12180 + }, + { + "epoch": 0.7460676908011507, + "grad_norm": 0.3654315769672394, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0047, + "step": 12190 + }, + { + "epoch": 0.7466797233612829, + "grad_norm": 0.18926535546779633, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7472917559214151, + "grad_norm": 0.21620485186576843, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0094, + "step": 12210 + }, + { + "epoch": 0.7479037884815473, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0059, + "step": 12220 + }, + { + "epoch": 0.7485158210416795, + "grad_norm": 0.39795419573783875, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 0.7491278536018117, + "grad_norm": 0.20502857863903046, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0048, + "step": 12240 + }, + { + "epoch": 0.7497398861619439, + "grad_norm": 0.23821429908275604, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0082, + "step": 12250 + }, + { + "epoch": 0.750351918722076, + "grad_norm": 0.45541366934776306, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0071, + "step": 12260 + }, + { + "epoch": 0.7509639512822082, + "grad_norm": 0.24881400167942047, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.7515759838423404, + "grad_norm": 0.2409125715494156, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0061, + "step": 12280 + }, + { + "epoch": 0.7521880164024726, + "grad_norm": 0.2930417060852051, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0054, + "step": 12290 + }, + { + "epoch": 0.7528000489626048, + "grad_norm": 0.30566394329071045, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0054, + "step": 12300 + }, + { + "epoch": 0.753412081522737, + "grad_norm": 0.32679763436317444, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0059, + "step": 12310 + }, + { + "epoch": 0.7540241140828692, + "grad_norm": 0.29273876547813416, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0067, + "step": 12320 + }, + { + "epoch": 0.7546361466430014, + "grad_norm": 0.19642773270606995, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0062, + "step": 12330 + }, + { + "epoch": 0.7552481792031336, + "grad_norm": 0.21928250789642334, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0066, + "step": 12340 + }, + { + "epoch": 0.7558602117632658, + "grad_norm": 0.2534322738647461, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0059, + "step": 12350 + }, + { + "epoch": 0.756472244323398, + "grad_norm": 0.20712649822235107, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 0.7570842768835302, + "grad_norm": 0.18670639395713806, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0063, + "step": 12370 + }, + { + "epoch": 0.7576963094436624, + "grad_norm": 0.26770254969596863, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0052, + "step": 12380 + }, + { + "epoch": 0.7583083420037946, + "grad_norm": 0.3621291518211365, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0056, + "step": 12390 + }, + { + "epoch": 0.7589203745639268, + "grad_norm": 0.31771939992904663, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0059, + "step": 12400 + }, + { + "epoch": 0.759532407124059, + "grad_norm": 0.44418177008628845, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0065, + "step": 12410 + }, + { + "epoch": 0.7601444396841912, + "grad_norm": 0.2183474898338318, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0046, + "step": 12420 + }, + { + "epoch": 0.7607564722443234, + "grad_norm": 0.4400590658187866, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0061, + "step": 12430 + }, + { + "epoch": 0.7613685048044556, + "grad_norm": 0.296539843082428, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0059, + "step": 12440 + }, + { + "epoch": 0.7619805373645878, + "grad_norm": 0.352870374917984, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0055, + "step": 12450 + }, + { + "epoch": 0.76259256992472, + "grad_norm": 0.19494596123695374, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 0.7632046024848522, + "grad_norm": 0.3799489438533783, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0057, + "step": 12470 + }, + { + "epoch": 0.7638166350449844, + "grad_norm": 0.3572365641593933, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0053, + "step": 12480 + }, + { + "epoch": 0.7644286676051166, + "grad_norm": 0.2559097707271576, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0062, + "step": 12490 + }, + { + "epoch": 0.7650407001652488, + "grad_norm": 0.13144978880882263, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0065, + "step": 12500 + }, + { + "epoch": 0.765652732725381, + "grad_norm": 0.34635287523269653, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7662647652855132, + "grad_norm": 0.25615188479423523, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0057, + "step": 12520 + }, + { + "epoch": 0.7668767978456454, + "grad_norm": 0.17619644105434418, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0047, + "step": 12530 + }, + { + "epoch": 0.7674888304057775, + "grad_norm": 0.20169994235038757, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0068, + "step": 12540 + }, + { + "epoch": 0.7681008629659097, + "grad_norm": 0.49686071276664734, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0066, + "step": 12550 + }, + { + "epoch": 0.7687128955260419, + "grad_norm": 0.28179335594177246, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0058, + "step": 12560 + }, + { + "epoch": 0.7693249280861741, + "grad_norm": 0.28156182169914246, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.005, + "step": 12570 + }, + { + "epoch": 0.7699369606463063, + "grad_norm": 0.15054315328598022, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0051, + "step": 12580 + }, + { + "epoch": 0.7705489932064385, + "grad_norm": 0.22872644662857056, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0066, + "step": 12590 + }, + { + "epoch": 0.7711610257665708, + "grad_norm": 0.25821951031684875, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0054, + "step": 12600 + }, + { + "epoch": 0.771773058326703, + "grad_norm": 0.23592771589756012, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0059, + "step": 12610 + }, + { + "epoch": 0.7723850908868352, + "grad_norm": 0.34409141540527344, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0053, + "step": 12620 + }, + { + "epoch": 0.7729971234469674, + "grad_norm": 0.2803158760070801, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0042, + "step": 12630 + }, + { + "epoch": 0.7736091560070996, + "grad_norm": 0.32796284556388855, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0074, + "step": 12640 + }, + { + "epoch": 0.7742211885672318, + "grad_norm": 0.34749120473861694, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0054, + "step": 12650 + }, + { + "epoch": 0.774833221127364, + "grad_norm": 0.34066343307495117, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0082, + "step": 12660 + }, + { + "epoch": 0.7754452536874962, + "grad_norm": 0.4294384717941284, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0062, + "step": 12670 + }, + { + "epoch": 0.7760572862476284, + "grad_norm": 0.2355230748653412, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0069, + "step": 12680 + }, + { + "epoch": 0.7766693188077606, + "grad_norm": 0.3181976079940796, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 0.7772813513678928, + "grad_norm": 0.2763727605342865, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0052, + "step": 12700 + }, + { + "epoch": 0.777893383928025, + "grad_norm": 0.2938949465751648, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0041, + "step": 12710 + }, + { + "epoch": 0.7785054164881572, + "grad_norm": 0.31331220269203186, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0062, + "step": 12720 + }, + { + "epoch": 0.7791174490482894, + "grad_norm": 0.3389904797077179, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0061, + "step": 12730 + }, + { + "epoch": 0.7797294816084216, + "grad_norm": 0.2848975360393524, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0065, + "step": 12740 + }, + { + "epoch": 0.7803415141685538, + "grad_norm": 0.29838478565216064, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0061, + "step": 12750 + }, + { + "epoch": 0.780953546728686, + "grad_norm": 0.47004032135009766, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0066, + "step": 12760 + }, + { + "epoch": 0.7815655792888182, + "grad_norm": 0.26898056268692017, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0063, + "step": 12770 + }, + { + "epoch": 0.7821776118489504, + "grad_norm": 0.29459917545318604, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0065, + "step": 12780 + }, + { + "epoch": 0.7827896444090826, + "grad_norm": 0.3481508791446686, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0058, + "step": 12790 + }, + { + "epoch": 0.7834016769692148, + "grad_norm": 0.1707627922296524, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0053, + "step": 12800 + }, + { + "epoch": 0.784013709529347, + "grad_norm": 0.14735333621501923, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0058, + "step": 12810 + }, + { + "epoch": 0.7846257420894791, + "grad_norm": 0.28002044558525085, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.006, + "step": 12820 + }, + { + "epoch": 0.7852377746496113, + "grad_norm": 0.39598894119262695, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0062, + "step": 12830 + }, + { + "epoch": 0.7858498072097435, + "grad_norm": 0.19379247725009918, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0058, + "step": 12840 + }, + { + "epoch": 0.7864618397698757, + "grad_norm": 0.27260729670524597, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7870738723300079, + "grad_norm": 0.2845087945461273, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0052, + "step": 12860 + }, + { + "epoch": 0.7876859048901401, + "grad_norm": 0.37151217460632324, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0043, + "step": 12870 + }, + { + "epoch": 0.7882979374502723, + "grad_norm": 0.3387412130832672, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0046, + "step": 12880 + }, + { + "epoch": 0.7889099700104045, + "grad_norm": 0.42672809958457947, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7895220025705367, + "grad_norm": 0.20378202199935913, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0062, + "step": 12900 + }, + { + "epoch": 0.7901340351306689, + "grad_norm": 0.16417330503463745, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0045, + "step": 12910 + }, + { + "epoch": 0.7907460676908011, + "grad_norm": 0.1704142540693283, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0054, + "step": 12920 + }, + { + "epoch": 0.7913581002509333, + "grad_norm": 0.21494890749454498, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0061, + "step": 12930 + }, + { + "epoch": 0.7919701328110655, + "grad_norm": 0.3430638909339905, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0046, + "step": 12940 + }, + { + "epoch": 0.7925821653711977, + "grad_norm": 0.22641201317310333, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0049, + "step": 12950 + }, + { + "epoch": 0.79319419793133, + "grad_norm": 0.27153971791267395, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 0.7938062304914622, + "grad_norm": 0.2648560702800751, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 0.7944182630515944, + "grad_norm": 0.2148633897304535, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0048, + "step": 12980 + }, + { + "epoch": 0.7950302956117266, + "grad_norm": 0.35170191526412964, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0069, + "step": 12990 + }, + { + "epoch": 0.7956423281718588, + "grad_norm": 0.3539712429046631, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0067, + "step": 13000 + }, + { + "epoch": 0.796254360731991, + "grad_norm": 0.29938259720802307, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0102, + "step": 13010 + }, + { + "epoch": 0.7968663932921232, + "grad_norm": 0.35241010785102844, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7974784258522554, + "grad_norm": 0.2929113805294037, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0061, + "step": 13030 + }, + { + "epoch": 0.7980904584123876, + "grad_norm": 0.24052929878234863, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0046, + "step": 13040 + }, + { + "epoch": 0.7987024909725198, + "grad_norm": 0.21611042320728302, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0043, + "step": 13050 + }, + { + "epoch": 0.799314523532652, + "grad_norm": 0.23498570919036865, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0046, + "step": 13060 + }, + { + "epoch": 0.7999265560927842, + "grad_norm": 0.30229923129081726, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0068, + "step": 13070 + }, + { + "epoch": 0.8005385886529164, + "grad_norm": 0.2916681170463562, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0062, + "step": 13080 + }, + { + "epoch": 0.8011506212130486, + "grad_norm": 0.31905195116996765, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.8017626537731807, + "grad_norm": 0.22307109832763672, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 0.8023746863333129, + "grad_norm": 0.2815198004245758, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0054, + "step": 13110 + }, + { + "epoch": 0.8029867188934451, + "grad_norm": 0.18762829899787903, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0064, + "step": 13120 + }, + { + "epoch": 0.8035987514535773, + "grad_norm": 0.1918255090713501, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0064, + "step": 13130 + }, + { + "epoch": 0.8042107840137095, + "grad_norm": 0.3726229667663574, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0065, + "step": 13140 + }, + { + "epoch": 0.8048228165738417, + "grad_norm": 0.423285573720932, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0062, + "step": 13150 + }, + { + "epoch": 0.8054348491339739, + "grad_norm": 0.1709958165884018, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0052, + "step": 13160 + }, + { + "epoch": 0.8060468816941061, + "grad_norm": 0.3615981936454773, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 0.8066589142542383, + "grad_norm": 0.2101999819278717, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0053, + "step": 13180 + }, + { + "epoch": 0.8072709468143705, + "grad_norm": 0.14393582940101624, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.8078829793745027, + "grad_norm": 0.3704521656036377, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0056, + "step": 13200 + }, + { + "epoch": 0.8084950119346349, + "grad_norm": 0.23275913298130035, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0051, + "step": 13210 + }, + { + "epoch": 0.8091070444947671, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0045, + "step": 13220 + }, + { + "epoch": 0.8097190770548993, + "grad_norm": 0.21721667051315308, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.8103311096150315, + "grad_norm": 0.29456019401550293, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0074, + "step": 13240 + }, + { + "epoch": 0.8109431421751637, + "grad_norm": 0.19854630529880524, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0071, + "step": 13250 + }, + { + "epoch": 0.8115551747352959, + "grad_norm": 0.4318163990974426, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0059, + "step": 13260 + }, + { + "epoch": 0.8121672072954281, + "grad_norm": 0.3421531915664673, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.006, + "step": 13270 + }, + { + "epoch": 0.8127792398555603, + "grad_norm": 0.2370125651359558, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0054, + "step": 13280 + }, + { + "epoch": 0.8133912724156925, + "grad_norm": 0.2996460497379303, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0055, + "step": 13290 + }, + { + "epoch": 0.8140033049758247, + "grad_norm": 0.2911904454231262, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0053, + "step": 13300 + }, + { + "epoch": 0.8146153375359569, + "grad_norm": 0.26010408997535706, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0053, + "step": 13310 + }, + { + "epoch": 0.8152273700960891, + "grad_norm": 0.404702752828598, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0081, + "step": 13320 + }, + { + "epoch": 0.8158394026562213, + "grad_norm": 0.25591781735420227, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0057, + "step": 13330 + }, + { + "epoch": 0.8164514352163535, + "grad_norm": 0.1437849998474121, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0064, + "step": 13340 + }, + { + "epoch": 0.8170634677764858, + "grad_norm": 0.12252022325992584, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0047, + "step": 13350 + }, + { + "epoch": 0.817675500336618, + "grad_norm": 0.1861230581998825, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0063, + "step": 13360 + }, + { + "epoch": 0.8182875328967502, + "grad_norm": 0.2313026636838913, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0066, + "step": 13370 + }, + { + "epoch": 0.8188995654568824, + "grad_norm": 0.5445839166641235, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0076, + "step": 13380 + }, + { + "epoch": 0.8195115980170145, + "grad_norm": 0.21818871796131134, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0068, + "step": 13390 + }, + { + "epoch": 0.8201236305771467, + "grad_norm": 0.21823963522911072, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0072, + "step": 13400 + }, + { + "epoch": 0.8207356631372789, + "grad_norm": 0.1730659157037735, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0051, + "step": 13410 + }, + { + "epoch": 0.8213476956974111, + "grad_norm": 0.1301007866859436, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0075, + "step": 13420 + }, + { + "epoch": 0.8219597282575433, + "grad_norm": 0.32452520728111267, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.005, + "step": 13430 + }, + { + "epoch": 0.8225717608176755, + "grad_norm": 0.24771001935005188, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0058, + "step": 13440 + }, + { + "epoch": 0.8231837933778077, + "grad_norm": 0.4575227200984955, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0062, + "step": 13450 + }, + { + "epoch": 0.8237958259379399, + "grad_norm": 0.16441279649734497, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0081, + "step": 13460 + }, + { + "epoch": 0.8244078584980721, + "grad_norm": 0.26582902669906616, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0069, + "step": 13470 + }, + { + "epoch": 0.8250198910582043, + "grad_norm": 0.18871302902698517, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0068, + "step": 13480 + }, + { + "epoch": 0.8256319236183365, + "grad_norm": 0.23244783282279968, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0063, + "step": 13490 + }, + { + "epoch": 0.8262439561784687, + "grad_norm": 0.2399880290031433, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0036, + "step": 13500 + }, + { + "epoch": 0.8268559887386009, + "grad_norm": 0.25766822695732117, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0074, + "step": 13510 + }, + { + "epoch": 0.8274680212987331, + "grad_norm": 0.24792100489139557, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0066, + "step": 13520 + }, + { + "epoch": 0.8280800538588653, + "grad_norm": 0.3371896743774414, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8286920864189975, + "grad_norm": 0.16249819099903107, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0079, + "step": 13540 + }, + { + "epoch": 0.8293041189791297, + "grad_norm": 0.2705139219760895, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0069, + "step": 13550 + }, + { + "epoch": 0.8299161515392619, + "grad_norm": 0.1905352771282196, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0055, + "step": 13560 + }, + { + "epoch": 0.8305281840993941, + "grad_norm": 0.23938500881195068, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0055, + "step": 13570 + }, + { + "epoch": 0.8311402166595263, + "grad_norm": 0.3562251031398773, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0054, + "step": 13580 + }, + { + "epoch": 0.8317522492196585, + "grad_norm": 0.2934769093990326, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0064, + "step": 13590 + }, + { + "epoch": 0.8323642817797907, + "grad_norm": 0.252366840839386, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0078, + "step": 13600 + }, + { + "epoch": 0.8329763143399229, + "grad_norm": 0.16646964848041534, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0046, + "step": 13610 + }, + { + "epoch": 0.8335883469000551, + "grad_norm": 0.22584658861160278, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8342003794601873, + "grad_norm": 0.3578774034976959, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0049, + "step": 13630 + }, + { + "epoch": 0.8348124120203195, + "grad_norm": 0.3447739779949188, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0065, + "step": 13640 + }, + { + "epoch": 0.8354244445804517, + "grad_norm": 0.381954550743103, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0057, + "step": 13650 + }, + { + "epoch": 0.8360364771405839, + "grad_norm": 0.3563731908798218, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0065, + "step": 13660 + }, + { + "epoch": 0.836648509700716, + "grad_norm": 0.29516372084617615, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0053, + "step": 13670 + }, + { + "epoch": 0.8372605422608482, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0043, + "step": 13680 + }, + { + "epoch": 0.8378725748209804, + "grad_norm": 0.4608387351036072, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.005, + "step": 13690 + }, + { + "epoch": 0.8384846073811126, + "grad_norm": 0.31025534868240356, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0055, + "step": 13700 + }, + { + "epoch": 0.8390966399412448, + "grad_norm": 0.32904690504074097, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.839708672501377, + "grad_norm": 0.2547053098678589, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0061, + "step": 13720 + }, + { + "epoch": 0.8403207050615092, + "grad_norm": 0.30524104833602905, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.005, + "step": 13730 + }, + { + "epoch": 0.8409327376216414, + "grad_norm": 0.17741642892360687, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "epoch": 0.8415447701817736, + "grad_norm": 0.23125578463077545, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0053, + "step": 13750 + }, + { + "epoch": 0.8421568027419059, + "grad_norm": 0.3080023229122162, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0075, + "step": 13760 + }, + { + "epoch": 0.842768835302038, + "grad_norm": 0.2509821951389313, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0053, + "step": 13770 + }, + { + "epoch": 0.8433808678621703, + "grad_norm": 0.17483864724636078, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.005, + "step": 13780 + }, + { + "epoch": 0.8439929004223025, + "grad_norm": 0.3952518403530121, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0056, + "step": 13790 + }, + { + "epoch": 0.8446049329824347, + "grad_norm": 0.2945535480976105, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0055, + "step": 13800 + }, + { + "epoch": 0.8452169655425669, + "grad_norm": 0.13024291396141052, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0073, + "step": 13810 + }, + { + "epoch": 0.8458289981026991, + "grad_norm": 0.1840520054101944, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0061, + "step": 13820 + }, + { + "epoch": 0.8464410306628313, + "grad_norm": 0.2368786782026291, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0058, + "step": 13830 + }, + { + "epoch": 0.8470530632229635, + "grad_norm": 0.2885456085205078, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0055, + "step": 13840 + }, + { + "epoch": 0.8476650957830957, + "grad_norm": 0.2782488167285919, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0046, + "step": 13850 + }, + { + "epoch": 0.8482771283432279, + "grad_norm": 0.1711442470550537, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0055, + "step": 13860 + }, + { + "epoch": 0.8488891609033601, + "grad_norm": 0.22235877811908722, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0056, + "step": 13870 + }, + { + "epoch": 0.8495011934634923, + "grad_norm": 0.1937183290719986, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0047, + "step": 13880 + }, + { + "epoch": 0.8501132260236245, + "grad_norm": 0.33960190415382385, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0063, + "step": 13890 + }, + { + "epoch": 0.8507252585837567, + "grad_norm": 0.1983388215303421, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8513372911438889, + "grad_norm": 0.2968246638774872, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0051, + "step": 13910 + }, + { + "epoch": 0.8519493237040211, + "grad_norm": 0.25328314304351807, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0057, + "step": 13920 + }, + { + "epoch": 0.8525613562641533, + "grad_norm": 0.2435184270143509, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0056, + "step": 13930 + }, + { + "epoch": 0.8531733888242855, + "grad_norm": 0.24512560665607452, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0053, + "step": 13940 + }, + { + "epoch": 0.8537854213844176, + "grad_norm": 0.22028976678848267, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.006, + "step": 13950 + }, + { + "epoch": 0.8543974539445498, + "grad_norm": 0.24743935465812683, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0065, + "step": 13960 + }, + { + "epoch": 0.855009486504682, + "grad_norm": 0.1393810361623764, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0081, + "step": 13970 + }, + { + "epoch": 0.8556215190648142, + "grad_norm": 0.25975972414016724, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0063, + "step": 13980 + }, + { + "epoch": 0.8562335516249464, + "grad_norm": 0.1944616585969925, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0049, + "step": 13990 + }, + { + "epoch": 0.8568455841850786, + "grad_norm": 0.21936742961406708, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0056, + "step": 14000 + }, + { + "epoch": 0.8574576167452108, + "grad_norm": 0.1556629091501236, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 0.858069649305343, + "grad_norm": 0.23696991801261902, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 0.8586816818654752, + "grad_norm": 0.32507795095443726, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0061, + "step": 14030 + }, + { + "epoch": 0.8592937144256074, + "grad_norm": 0.35332199931144714, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0058, + "step": 14040 + }, + { + "epoch": 0.8599057469857396, + "grad_norm": 0.1835644394159317, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0046, + "step": 14050 + }, + { + "epoch": 0.8605177795458718, + "grad_norm": 0.19127517938613892, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0044, + "step": 14060 + }, + { + "epoch": 0.861129812106004, + "grad_norm": 0.30748996138572693, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0055, + "step": 14070 + }, + { + "epoch": 0.8617418446661362, + "grad_norm": 0.178785502910614, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0049, + "step": 14080 + }, + { + "epoch": 0.8623538772262684, + "grad_norm": 0.16979056596755981, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0044, + "step": 14090 + }, + { + "epoch": 0.8629659097864006, + "grad_norm": 0.19519983232021332, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0063, + "step": 14100 + }, + { + "epoch": 0.8635779423465328, + "grad_norm": 0.2722550928592682, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 0.864189974906665, + "grad_norm": 0.1956222504377365, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0054, + "step": 14120 + }, + { + "epoch": 0.8648020074667973, + "grad_norm": 0.32274308800697327, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0066, + "step": 14130 + }, + { + "epoch": 0.8654140400269295, + "grad_norm": 0.25953641533851624, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0056, + "step": 14140 + }, + { + "epoch": 0.8660260725870617, + "grad_norm": 0.3293299674987793, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0057, + "step": 14150 + }, + { + "epoch": 0.8666381051471939, + "grad_norm": 0.35404127836227417, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0072, + "step": 14160 + }, + { + "epoch": 0.8672501377073261, + "grad_norm": 0.24674376845359802, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0064, + "step": 14170 + }, + { + "epoch": 0.8678621702674583, + "grad_norm": 0.23506462574005127, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0047, + "step": 14180 + }, + { + "epoch": 0.8684742028275905, + "grad_norm": 0.30500903725624084, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0059, + "step": 14190 + }, + { + "epoch": 0.8690862353877227, + "grad_norm": 0.23000167310237885, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0051, + "step": 14200 + }, + { + "epoch": 0.8696982679478549, + "grad_norm": 0.17339368164539337, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 0.8703103005079871, + "grad_norm": 0.2505367696285248, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0059, + "step": 14220 + }, + { + "epoch": 0.8709223330681192, + "grad_norm": 0.22645734250545502, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0044, + "step": 14230 + }, + { + "epoch": 0.8715343656282514, + "grad_norm": 0.3509127199649811, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0043, + "step": 14240 + }, + { + "epoch": 0.8721463981883836, + "grad_norm": 0.2758972644805908, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0055, + "step": 14250 + }, + { + "epoch": 0.8727584307485158, + "grad_norm": 0.1943834275007248, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.006, + "step": 14260 + }, + { + "epoch": 0.873370463308648, + "grad_norm": 0.32881075143814087, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 0.8739824958687802, + "grad_norm": 0.35203438997268677, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8745945284289124, + "grad_norm": 0.13618917763233185, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0044, + "step": 14290 + }, + { + "epoch": 0.8752065609890446, + "grad_norm": 0.22939404845237732, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0044, + "step": 14300 + }, + { + "epoch": 0.8758185935491768, + "grad_norm": 0.2027491182088852, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0051, + "step": 14310 + }, + { + "epoch": 0.876430626109309, + "grad_norm": 0.21950028836727142, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0105, + "step": 14320 + }, + { + "epoch": 0.8770426586694412, + "grad_norm": 0.307913213968277, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0051, + "step": 14330 + }, + { + "epoch": 0.8776546912295734, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0058, + "step": 14340 + }, + { + "epoch": 0.8782667237897056, + "grad_norm": 0.3033636808395386, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0066, + "step": 14350 + }, + { + "epoch": 0.8788787563498378, + "grad_norm": 0.25514236092567444, + "learning_rate": 1.210961823379053e-05, + "loss": 0.005, + "step": 14360 + }, + { + "epoch": 0.87949078890997, + "grad_norm": 0.2574418783187866, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0069, + "step": 14370 + }, + { + "epoch": 0.8801028214701022, + "grad_norm": 0.17803016304969788, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.006, + "step": 14380 + }, + { + "epoch": 0.8807148540302344, + "grad_norm": 0.31375741958618164, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0077, + "step": 14390 + }, + { + "epoch": 0.8813268865903666, + "grad_norm": 0.18031778931617737, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0052, + "step": 14400 + }, + { + "epoch": 0.8819389191504988, + "grad_norm": 0.18077519536018372, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0055, + "step": 14410 + }, + { + "epoch": 0.882550951710631, + "grad_norm": 0.22171644866466522, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0059, + "step": 14420 + }, + { + "epoch": 0.8831629842707632, + "grad_norm": 0.16187389194965363, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0044, + "step": 14430 + }, + { + "epoch": 0.8837750168308954, + "grad_norm": 0.27667325735092163, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0063, + "step": 14440 + }, + { + "epoch": 0.8843870493910276, + "grad_norm": 0.2493051290512085, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0072, + "step": 14450 + }, + { + "epoch": 0.8849990819511598, + "grad_norm": 0.3519611656665802, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0057, + "step": 14460 + }, + { + "epoch": 0.885611114511292, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0057, + "step": 14470 + }, + { + "epoch": 0.8862231470714242, + "grad_norm": 0.24518658220767975, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.8868351796315564, + "grad_norm": 0.28493785858154297, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0055, + "step": 14490 + }, + { + "epoch": 0.8874472121916887, + "grad_norm": 0.22260263562202454, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 0.8880592447518207, + "grad_norm": 0.2804561257362366, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0051, + "step": 14510 + }, + { + "epoch": 0.888671277311953, + "grad_norm": 0.24349385499954224, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0045, + "step": 14520 + }, + { + "epoch": 0.8892833098720851, + "grad_norm": 0.262207955121994, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0082, + "step": 14530 + }, + { + "epoch": 0.8898953424322174, + "grad_norm": 0.15527820587158203, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0061, + "step": 14540 + }, + { + "epoch": 0.8905073749923496, + "grad_norm": 0.23850804567337036, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0042, + "step": 14550 + }, + { + "epoch": 0.8911194075524818, + "grad_norm": 0.2665582001209259, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0053, + "step": 14560 + }, + { + "epoch": 0.891731440112614, + "grad_norm": 0.2652167081832886, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "epoch": 0.8923434726727462, + "grad_norm": 0.21386243402957916, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "epoch": 0.8929555052328784, + "grad_norm": 0.3087247312068939, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0082, + "step": 14590 + }, + { + "epoch": 0.8935675377930106, + "grad_norm": 0.2003909796476364, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0048, + "step": 14600 + }, + { + "epoch": 0.8941795703531428, + "grad_norm": 0.2214624583721161, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0062, + "step": 14610 + }, + { + "epoch": 0.894791602913275, + "grad_norm": 0.2500647306442261, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0052, + "step": 14620 + }, + { + "epoch": 0.8954036354734072, + "grad_norm": 0.2615419030189514, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0054, + "step": 14630 + }, + { + "epoch": 0.8960156680335394, + "grad_norm": 0.21347551047801971, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0048, + "step": 14640 + }, + { + "epoch": 0.8966277005936716, + "grad_norm": 0.35483887791633606, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0054, + "step": 14650 + }, + { + "epoch": 0.8972397331538038, + "grad_norm": 0.2423439472913742, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0055, + "step": 14660 + }, + { + "epoch": 0.897851765713936, + "grad_norm": 0.16826359927654266, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 0.8984637982740682, + "grad_norm": 0.3589499294757843, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "epoch": 0.8990758308342004, + "grad_norm": 0.3081042468547821, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 0.8996878633943326, + "grad_norm": 0.31996914744377136, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0063, + "step": 14700 + }, + { + "epoch": 0.9002998959544648, + "grad_norm": 0.301209419965744, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0073, + "step": 14710 + }, + { + "epoch": 0.900911928514597, + "grad_norm": 0.19257168471813202, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0055, + "step": 14720 + }, + { + "epoch": 0.9015239610747292, + "grad_norm": 0.15221600234508514, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0053, + "step": 14730 + }, + { + "epoch": 0.9021359936348614, + "grad_norm": 0.21519577503204346, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0055, + "step": 14740 + }, + { + "epoch": 0.9027480261949936, + "grad_norm": 0.23772196471691132, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.006, + "step": 14750 + }, + { + "epoch": 0.9033600587551258, + "grad_norm": 0.2872219979763031, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 0.903972091315258, + "grad_norm": 0.2589483857154846, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0056, + "step": 14770 + }, + { + "epoch": 0.9045841238753902, + "grad_norm": 0.31850162148475647, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0051, + "step": 14780 + }, + { + "epoch": 0.9051961564355223, + "grad_norm": 0.27179282903671265, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0051, + "step": 14790 + }, + { + "epoch": 0.9058081889956545, + "grad_norm": 0.4132739007472992, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.005, + "step": 14800 + }, + { + "epoch": 0.9064202215557867, + "grad_norm": 0.19336774945259094, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0041, + "step": 14810 + }, + { + "epoch": 0.9070322541159189, + "grad_norm": 0.20783282816410065, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.9076442866760511, + "grad_norm": 0.26141899824142456, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0069, + "step": 14830 + }, + { + "epoch": 0.9082563192361833, + "grad_norm": 0.2158539742231369, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0081, + "step": 14840 + }, + { + "epoch": 0.9088683517963155, + "grad_norm": 0.3233732581138611, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0065, + "step": 14850 + }, + { + "epoch": 0.9094803843564477, + "grad_norm": 0.23924769461154938, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0042, + "step": 14860 + }, + { + "epoch": 0.9100924169165799, + "grad_norm": 0.17663812637329102, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.004, + "step": 14870 + }, + { + "epoch": 0.9107044494767121, + "grad_norm": 0.34379643201828003, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.005, + "step": 14880 + }, + { + "epoch": 0.9113164820368443, + "grad_norm": 0.29971349239349365, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0059, + "step": 14890 + }, + { + "epoch": 0.9119285145969765, + "grad_norm": 0.24832949042320251, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0042, + "step": 14900 + }, + { + "epoch": 0.9125405471571088, + "grad_norm": 0.22288024425506592, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0065, + "step": 14910 + }, + { + "epoch": 0.913152579717241, + "grad_norm": 0.2806689441204071, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0043, + "step": 14920 + }, + { + "epoch": 0.9137646122773732, + "grad_norm": 0.3908274173736572, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0047, + "step": 14930 + }, + { + "epoch": 0.9143766448375054, + "grad_norm": 0.16255778074264526, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0068, + "step": 14940 + }, + { + "epoch": 0.9149886773976376, + "grad_norm": 0.430791437625885, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0085, + "step": 14950 + }, + { + "epoch": 0.9156007099577698, + "grad_norm": 0.1739969551563263, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0056, + "step": 14960 + }, + { + "epoch": 0.916212742517902, + "grad_norm": 0.24298283457756042, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0055, + "step": 14970 + }, + { + "epoch": 0.9168247750780342, + "grad_norm": 0.21269915997982025, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0051, + "step": 14980 + }, + { + "epoch": 0.9174368076381664, + "grad_norm": 0.263388991355896, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0058, + "step": 14990 + }, + { + "epoch": 0.9180488401982986, + "grad_norm": 0.28030532598495483, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 0.9186608727584308, + "grad_norm": 0.17051894962787628, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0049, + "step": 15010 + }, + { + "epoch": 0.919272905318563, + "grad_norm": 0.2763383388519287, + "learning_rate": 1.146875176249365e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 0.9198849378786952, + "grad_norm": 0.2616822421550751, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "epoch": 0.9204969704388274, + "grad_norm": 0.21407093107700348, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0062, + "step": 15040 + }, + { + "epoch": 0.9211090029989596, + "grad_norm": 0.23936578631401062, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0073, + "step": 15050 + }, + { + "epoch": 0.9217210355590918, + "grad_norm": 0.26383110880851746, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.006, + "step": 15060 + }, + { + "epoch": 0.922333068119224, + "grad_norm": 0.19477945566177368, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0043, + "step": 15070 + }, + { + "epoch": 0.9229451006793561, + "grad_norm": 0.16677282750606537, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0061, + "step": 15080 + }, + { + "epoch": 0.9235571332394883, + "grad_norm": 0.26856037974357605, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0065, + "step": 15090 + }, + { + "epoch": 0.9241691657996205, + "grad_norm": 0.20086173713207245, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0056, + "step": 15100 + }, + { + "epoch": 0.9247811983597527, + "grad_norm": 0.26998719573020935, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0034, + "step": 15110 + }, + { + "epoch": 0.9253932309198849, + "grad_norm": 0.12727728486061096, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0043, + "step": 15120 + }, + { + "epoch": 0.9260052634800171, + "grad_norm": 0.11288347095251083, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0055, + "step": 15130 + }, + { + "epoch": 0.9266172960401493, + "grad_norm": 0.1109771579504013, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0048, + "step": 15140 + }, + { + "epoch": 0.9272293286002815, + "grad_norm": 0.2556479275226593, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0056, + "step": 15150 + }, + { + "epoch": 0.9278413611604137, + "grad_norm": 0.2149561196565628, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.005, + "step": 15160 + }, + { + "epoch": 0.9284533937205459, + "grad_norm": 0.16953054070472717, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 0.9290654262806781, + "grad_norm": 0.18306049704551697, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.004, + "step": 15180 + }, + { + "epoch": 0.9296774588408103, + "grad_norm": 0.15755385160446167, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0035, + "step": 15190 + }, + { + "epoch": 0.9302894914009425, + "grad_norm": 0.21062517166137695, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 0.9309015239610747, + "grad_norm": 0.1403888463973999, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0051, + "step": 15210 + }, + { + "epoch": 0.9315135565212069, + "grad_norm": 0.4044550359249115, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0062, + "step": 15220 + }, + { + "epoch": 0.9321255890813391, + "grad_norm": 0.22543896734714508, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0056, + "step": 15230 + }, + { + "epoch": 0.9327376216414713, + "grad_norm": 0.2025403380393982, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0055, + "step": 15240 + }, + { + "epoch": 0.9333496542016035, + "grad_norm": 1.0549683570861816, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0092, + "step": 15250 + }, + { + "epoch": 0.9339616867617357, + "grad_norm": 0.3442397117614746, + "learning_rate": 1.123494277220359e-05, + "loss": 0.005, + "step": 15260 + }, + { + "epoch": 0.934573719321868, + "grad_norm": 0.1678813248872757, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.005, + "step": 15270 + }, + { + "epoch": 0.9351857518820001, + "grad_norm": 0.31081119179725647, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 0.9357977844421324, + "grad_norm": 0.25498780608177185, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.006, + "step": 15290 + }, + { + "epoch": 0.9364098170022646, + "grad_norm": 0.21825125813484192, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0054, + "step": 15300 + }, + { + "epoch": 0.9370218495623968, + "grad_norm": 0.19719983637332916, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0074, + "step": 15310 + }, + { + "epoch": 0.937633882122529, + "grad_norm": 0.32297465205192566, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0058, + "step": 15320 + }, + { + "epoch": 0.9382459146826612, + "grad_norm": 0.2717733383178711, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0035, + "step": 15330 + }, + { + "epoch": 0.9388579472427934, + "grad_norm": 0.22138433158397675, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0048, + "step": 15340 + }, + { + "epoch": 0.9394699798029256, + "grad_norm": 0.1943465769290924, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 0.9400820123630577, + "grad_norm": 0.18422184884548187, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0054, + "step": 15360 + }, + { + "epoch": 0.9406940449231899, + "grad_norm": 0.17614246904850006, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0054, + "step": 15370 + }, + { + "epoch": 0.9413060774833221, + "grad_norm": 0.17661592364311218, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9419181100434543, + "grad_norm": 0.42976850271224976, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0054, + "step": 15390 + }, + { + "epoch": 0.9425301426035865, + "grad_norm": 0.34272316098213196, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0044, + "step": 15400 + }, + { + "epoch": 0.9431421751637187, + "grad_norm": 0.3346613645553589, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0042, + "step": 15410 + }, + { + "epoch": 0.9437542077238509, + "grad_norm": 0.15300114452838898, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0057, + "step": 15420 + }, + { + "epoch": 0.9443662402839831, + "grad_norm": 0.23935656249523163, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0084, + "step": 15430 + }, + { + "epoch": 0.9449782728441153, + "grad_norm": 0.21595227718353271, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0051, + "step": 15440 + }, + { + "epoch": 0.9455903054042475, + "grad_norm": 0.2670149505138397, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0059, + "step": 15450 + }, + { + "epoch": 0.9462023379643797, + "grad_norm": 0.2214009314775467, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0054, + "step": 15460 + }, + { + "epoch": 0.9468143705245119, + "grad_norm": 0.3491996228694916, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0057, + "step": 15470 + }, + { + "epoch": 0.9474264030846441, + "grad_norm": 0.28213024139404297, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0054, + "step": 15480 + }, + { + "epoch": 0.9480384356447763, + "grad_norm": 0.30218765139579773, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0049, + "step": 15490 + }, + { + "epoch": 0.9486504682049085, + "grad_norm": 0.17068025469779968, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0046, + "step": 15500 + }, + { + "epoch": 0.9492625007650407, + "grad_norm": 0.23325121402740479, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0054, + "step": 15510 + }, + { + "epoch": 0.9498745333251729, + "grad_norm": 0.22118528187274933, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0035, + "step": 15520 + }, + { + "epoch": 0.9504865658853051, + "grad_norm": 0.20202121138572693, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9510985984454373, + "grad_norm": 0.28455010056495667, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0039, + "step": 15540 + }, + { + "epoch": 0.9517106310055695, + "grad_norm": 0.26871445775032043, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0046, + "step": 15550 + }, + { + "epoch": 0.9523226635657017, + "grad_norm": 0.33665943145751953, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0058, + "step": 15560 + }, + { + "epoch": 0.9529346961258339, + "grad_norm": 0.3182595670223236, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0053, + "step": 15570 + }, + { + "epoch": 0.9535467286859661, + "grad_norm": 0.2867930829524994, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0068, + "step": 15580 + }, + { + "epoch": 0.9541587612460983, + "grad_norm": 0.21562239527702332, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.0051, + "step": 15590 + }, + { + "epoch": 0.9547707938062305, + "grad_norm": 0.19122859835624695, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0046, + "step": 15600 + }, + { + "epoch": 0.9553828263663627, + "grad_norm": 0.24596959352493286, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.005, + "step": 15610 + }, + { + "epoch": 0.9559948589264949, + "grad_norm": 0.182195246219635, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0038, + "step": 15620 + }, + { + "epoch": 0.9566068914866271, + "grad_norm": 0.3122585415840149, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0053, + "step": 15630 + }, + { + "epoch": 0.9572189240467592, + "grad_norm": 0.25725093483924866, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0063, + "step": 15640 + }, + { + "epoch": 0.9578309566068914, + "grad_norm": 0.19965514540672302, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0053, + "step": 15650 + }, + { + "epoch": 0.9584429891670236, + "grad_norm": 0.3474758267402649, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.006, + "step": 15660 + }, + { + "epoch": 0.9590550217271558, + "grad_norm": 0.18151336908340454, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0048, + "step": 15670 + }, + { + "epoch": 0.959667054287288, + "grad_norm": 0.18923020362854004, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0037, + "step": 15680 + }, + { + "epoch": 0.9602790868474202, + "grad_norm": 0.19792871177196503, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0049, + "step": 15690 + }, + { + "epoch": 0.9608911194075525, + "grad_norm": 0.20296797156333923, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0047, + "step": 15700 + }, + { + "epoch": 0.9615031519676847, + "grad_norm": 0.2556051015853882, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0054, + "step": 15710 + }, + { + "epoch": 0.9621151845278169, + "grad_norm": 0.35538288950920105, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0037, + "step": 15720 + }, + { + "epoch": 0.9627272170879491, + "grad_norm": 0.45357266068458557, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0065, + "step": 15730 + }, + { + "epoch": 0.9633392496480813, + "grad_norm": 0.23721693456172943, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0046, + "step": 15740 + }, + { + "epoch": 0.9639512822082135, + "grad_norm": 0.2727845013141632, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0052, + "step": 15750 + }, + { + "epoch": 0.9645633147683457, + "grad_norm": 0.2647950351238251, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0054, + "step": 15760 + }, + { + "epoch": 0.9651753473284779, + "grad_norm": 0.23364882171154022, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.005, + "step": 15770 + }, + { + "epoch": 0.9657873798886101, + "grad_norm": 0.2035825401544571, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0054, + "step": 15780 + }, + { + "epoch": 0.9663994124487423, + "grad_norm": 0.2411692589521408, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0062, + "step": 15790 + }, + { + "epoch": 0.9670114450088745, + "grad_norm": 0.23559266328811646, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0052, + "step": 15800 + }, + { + "epoch": 0.9676234775690067, + "grad_norm": 0.23872418701648712, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0063, + "step": 15810 + }, + { + "epoch": 0.9682355101291389, + "grad_norm": 0.27072128653526306, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0052, + "step": 15820 + }, + { + "epoch": 0.9688475426892711, + "grad_norm": 0.42610588669776917, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0056, + "step": 15830 + }, + { + "epoch": 0.9694595752494033, + "grad_norm": 0.13065233826637268, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0044, + "step": 15840 + }, + { + "epoch": 0.9700716078095355, + "grad_norm": 0.2479996383190155, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0049, + "step": 15850 + }, + { + "epoch": 0.9706836403696677, + "grad_norm": 0.22867974638938904, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0053, + "step": 15860 + }, + { + "epoch": 0.9712956729297999, + "grad_norm": 0.21570387482643127, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0048, + "step": 15870 + }, + { + "epoch": 0.9719077054899321, + "grad_norm": 0.26354169845581055, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0073, + "step": 15880 + }, + { + "epoch": 0.9725197380500643, + "grad_norm": 0.19785451889038086, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0044, + "step": 15890 + }, + { + "epoch": 0.9731317706101965, + "grad_norm": 0.09346124529838562, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0051, + "step": 15900 + }, + { + "epoch": 0.9737438031703287, + "grad_norm": 0.18946298956871033, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0049, + "step": 15910 + }, + { + "epoch": 0.9743558357304608, + "grad_norm": 0.1761726588010788, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0057, + "step": 15920 + }, + { + "epoch": 0.974967868290593, + "grad_norm": 0.2610328495502472, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0061, + "step": 15930 + }, + { + "epoch": 0.9755799008507252, + "grad_norm": 0.1841743141412735, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 0.9761919334108574, + "grad_norm": 0.14279355108737946, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 0.9768039659709896, + "grad_norm": 0.1717681884765625, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0035, + "step": 15960 + }, + { + "epoch": 0.9774159985311218, + "grad_norm": 0.2102527618408203, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.007, + "step": 15970 + }, + { + "epoch": 0.978028031091254, + "grad_norm": 0.29462379217147827, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0058, + "step": 15980 + }, + { + "epoch": 0.9786400636513862, + "grad_norm": 0.1863207072019577, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0058, + "step": 15990 + }, + { + "epoch": 0.9792520962115184, + "grad_norm": 0.2764773964881897, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0051, + "step": 16000 + }, + { + "epoch": 0.9798641287716506, + "grad_norm": 0.2723250091075897, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0056, + "step": 16010 + }, + { + "epoch": 0.9804761613317828, + "grad_norm": 0.21564331650733948, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0048, + "step": 16020 + }, + { + "epoch": 0.981088193891915, + "grad_norm": 0.20242232084274292, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0058, + "step": 16030 + }, + { + "epoch": 0.9817002264520472, + "grad_norm": 0.21522754430770874, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0039, + "step": 16040 + }, + { + "epoch": 0.9823122590121794, + "grad_norm": 0.20013833045959473, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0051, + "step": 16050 + }, + { + "epoch": 0.9829242915723116, + "grad_norm": 0.3008810579776764, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0052, + "step": 16060 + }, + { + "epoch": 0.9835363241324439, + "grad_norm": 0.2994979918003082, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0033, + "step": 16070 + }, + { + "epoch": 0.984148356692576, + "grad_norm": 0.22704628109931946, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0046, + "step": 16080 + }, + { + "epoch": 0.9847603892527083, + "grad_norm": 0.3253551423549652, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9853724218128405, + "grad_norm": 0.14902091026306152, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0042, + "step": 16100 + }, + { + "epoch": 0.9859844543729727, + "grad_norm": 0.15155524015426636, + "learning_rate": 1.04066696184376e-05, + "loss": 0.005, + "step": 16110 + }, + { + "epoch": 0.9865964869331049, + "grad_norm": 0.1859518140554428, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0063, + "step": 16120 + }, + { + "epoch": 0.9872085194932371, + "grad_norm": 0.5434902906417847, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0072, + "step": 16130 + }, + { + "epoch": 0.9878205520533693, + "grad_norm": 0.19308103621006012, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0046, + "step": 16140 + }, + { + "epoch": 0.9884325846135015, + "grad_norm": 0.21260593831539154, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0077, + "step": 16150 + }, + { + "epoch": 0.9890446171736337, + "grad_norm": 0.15255668759346008, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0059, + "step": 16160 + }, + { + "epoch": 0.9896566497337659, + "grad_norm": 0.18739885091781616, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0047, + "step": 16170 + }, + { + "epoch": 0.9902686822938981, + "grad_norm": 0.2112029641866684, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0049, + "step": 16180 + }, + { + "epoch": 0.9908807148540303, + "grad_norm": 0.35941991209983826, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.005, + "step": 16190 + }, + { + "epoch": 0.9914927474141624, + "grad_norm": 0.16792108118534088, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0051, + "step": 16200 + }, + { + "epoch": 0.9921047799742946, + "grad_norm": 0.1985466182231903, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0063, + "step": 16210 + }, + { + "epoch": 0.9927168125344268, + "grad_norm": 0.17579570412635803, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0046, + "step": 16220 + }, + { + "epoch": 0.993328845094559, + "grad_norm": 0.23352178931236267, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0061, + "step": 16230 + }, + { + "epoch": 0.9939408776546912, + "grad_norm": 0.3543553054332733, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0054, + "step": 16240 + }, + { + "epoch": 0.9945529102148234, + "grad_norm": 0.18603719770908356, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0049, + "step": 16250 + }, + { + "epoch": 0.9951649427749556, + "grad_norm": 0.31745344400405884, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0061, + "step": 16260 + }, + { + "epoch": 0.9957769753350878, + "grad_norm": 0.1416773498058319, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0063, + "step": 16270 + }, + { + "epoch": 0.99638900789522, + "grad_norm": 0.18451642990112305, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0055, + "step": 16280 + }, + { + "epoch": 0.9970010404553522, + "grad_norm": 0.13422183692455292, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0047, + "step": 16290 + }, + { + "epoch": 0.9976130730154844, + "grad_norm": 0.15831588208675385, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0045, + "step": 16300 + }, + { + "epoch": 0.9982251055756166, + "grad_norm": 0.42520084977149963, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0053, + "step": 16310 + }, + { + "epoch": 0.9988371381357488, + "grad_norm": 0.20889437198638916, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0043, + "step": 16320 + }, + { + "epoch": 0.999449170695881, + "grad_norm": 0.17016667127609253, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0072, + "step": 16330 + }, + { + "epoch": 1.0000612032560132, + "grad_norm": 0.3129214346408844, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0054, + "step": 16340 + }, + { + "epoch": 1.0006732358161454, + "grad_norm": 0.334224134683609, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0037, + "step": 16350 + }, + { + "epoch": 1.0012852683762776, + "grad_norm": 0.28502705693244934, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0065, + "step": 16360 + }, + { + "epoch": 1.0018973009364098, + "grad_norm": 0.21431966125965118, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0046, + "step": 16370 + }, + { + "epoch": 1.002509333496542, + "grad_norm": 0.22898051142692566, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.006, + "step": 16380 + }, + { + "epoch": 1.0031213660566742, + "grad_norm": 0.41625624895095825, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0068, + "step": 16390 + }, + { + "epoch": 1.0037333986168064, + "grad_norm": 0.2510327398777008, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 1.0043454311769386, + "grad_norm": 0.23560962080955505, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0049, + "step": 16410 + }, + { + "epoch": 1.0049574637370708, + "grad_norm": 0.2081199437379837, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0061, + "step": 16420 + }, + { + "epoch": 1.005569496297203, + "grad_norm": 0.12456244230270386, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0057, + "step": 16430 + }, + { + "epoch": 1.0061815288573353, + "grad_norm": 0.22212636470794678, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0052, + "step": 16440 + }, + { + "epoch": 1.0067935614174675, + "grad_norm": 0.27772897481918335, + "learning_rate": 1.007637577910799e-05, + "loss": 0.007, + "step": 16450 + }, + { + "epoch": 1.0074055939775997, + "grad_norm": 0.40040507912635803, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0051, + "step": 16460 + }, + { + "epoch": 1.0080176265377319, + "grad_norm": 0.19763565063476562, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0047, + "step": 16470 + }, + { + "epoch": 1.008629659097864, + "grad_norm": 0.2906181514263153, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0055, + "step": 16480 + }, + { + "epoch": 1.0092416916579963, + "grad_norm": 0.29949888586997986, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0045, + "step": 16490 + }, + { + "epoch": 1.0098537242181285, + "grad_norm": 0.3900962769985199, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0053, + "step": 16500 + }, + { + "epoch": 1.0104657567782607, + "grad_norm": 0.22380846738815308, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0043, + "step": 16510 + }, + { + "epoch": 1.0110777893383929, + "grad_norm": 0.3426673412322998, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0052, + "step": 16520 + }, + { + "epoch": 1.011689821898525, + "grad_norm": 0.2452230006456375, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0055, + "step": 16530 + }, + { + "epoch": 1.0123018544586573, + "grad_norm": 0.24280408024787903, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0042, + "step": 16540 + }, + { + "epoch": 1.0129138870187895, + "grad_norm": 0.18271701037883759, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0047, + "step": 16550 + }, + { + "epoch": 1.0135259195789217, + "grad_norm": 0.2874322235584259, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0059, + "step": 16560 + }, + { + "epoch": 1.0141379521390539, + "grad_norm": 0.17367394268512726, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0048, + "step": 16570 + }, + { + "epoch": 1.014749984699186, + "grad_norm": 0.167460098862648, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0074, + "step": 16580 + }, + { + "epoch": 1.0153620172593183, + "grad_norm": 0.21867765486240387, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 1.0159740498194505, + "grad_norm": 0.2539086639881134, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0057, + "step": 16600 + }, + { + "epoch": 1.0165860823795827, + "grad_norm": 0.1415795534849167, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 1.0171981149397147, + "grad_norm": 0.12702493369579315, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0038, + "step": 16620 + }, + { + "epoch": 1.0178101474998469, + "grad_norm": 0.16548305749893188, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0042, + "step": 16630 + }, + { + "epoch": 1.018422180059979, + "grad_norm": 0.4413173496723175, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0059, + "step": 16640 + }, + { + "epoch": 1.0190342126201113, + "grad_norm": 0.30871614813804626, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0045, + "step": 16650 + }, + { + "epoch": 1.0196462451802435, + "grad_norm": 0.259650319814682, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0044, + "step": 16660 + }, + { + "epoch": 1.0202582777403757, + "grad_norm": 0.36035388708114624, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0068, + "step": 16670 + }, + { + "epoch": 1.020870310300508, + "grad_norm": 0.3487808406352997, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0038, + "step": 16680 + }, + { + "epoch": 1.02148234286064, + "grad_norm": 0.2898370623588562, + "learning_rate": 9.843955128197274e-06, + "loss": 0.004, + "step": 16690 + }, + { + "epoch": 1.0220943754207723, + "grad_norm": 0.2942182719707489, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0042, + "step": 16700 + }, + { + "epoch": 1.0227064079809045, + "grad_norm": 0.27839869260787964, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0042, + "step": 16710 + }, + { + "epoch": 1.0233184405410367, + "grad_norm": 0.17199957370758057, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0059, + "step": 16720 + }, + { + "epoch": 1.023930473101169, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0073, + "step": 16730 + }, + { + "epoch": 1.0245425056613011, + "grad_norm": 0.19908513128757477, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0047, + "step": 16740 + }, + { + "epoch": 1.0251545382214333, + "grad_norm": 0.23300328850746155, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0055, + "step": 16750 + }, + { + "epoch": 1.0257665707815655, + "grad_norm": 0.24671277403831482, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0043, + "step": 16760 + }, + { + "epoch": 1.0263786033416977, + "grad_norm": 0.23183101415634155, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0052, + "step": 16770 + }, + { + "epoch": 1.02699063590183, + "grad_norm": 0.13460612297058105, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0035, + "step": 16780 + }, + { + "epoch": 1.0276026684619621, + "grad_norm": 0.1990940123796463, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0044, + "step": 16790 + }, + { + "epoch": 1.0282147010220943, + "grad_norm": 0.21223406493663788, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0036, + "step": 16800 + }, + { + "epoch": 1.0288267335822265, + "grad_norm": 0.2649106979370117, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0044, + "step": 16810 + }, + { + "epoch": 1.0294387661423587, + "grad_norm": 0.2524845600128174, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0048, + "step": 16820 + }, + { + "epoch": 1.030050798702491, + "grad_norm": 0.22169779241085052, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0047, + "step": 16830 + }, + { + "epoch": 1.0306628312626231, + "grad_norm": 0.16642418503761292, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0048, + "step": 16840 + }, + { + "epoch": 1.0312748638227553, + "grad_norm": 0.22939598560333252, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0048, + "step": 16850 + }, + { + "epoch": 1.0318868963828876, + "grad_norm": 0.2131129503250122, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0055, + "step": 16860 + }, + { + "epoch": 1.0324989289430198, + "grad_norm": 0.20492705702781677, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0041, + "step": 16870 + }, + { + "epoch": 1.033110961503152, + "grad_norm": 0.2988845705986023, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0042, + "step": 16880 + }, + { + "epoch": 1.0337229940632842, + "grad_norm": 0.18579600751399994, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0042, + "step": 16890 + }, + { + "epoch": 1.0343350266234164, + "grad_norm": 0.2553490698337555, + "learning_rate": 9.641222698101725e-06, + "loss": 0.005, + "step": 16900 + }, + { + "epoch": 1.0349470591835486, + "grad_norm": 0.338440865278244, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0036, + "step": 16910 + }, + { + "epoch": 1.0355590917436808, + "grad_norm": 0.12755723297595978, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0044, + "step": 16920 + }, + { + "epoch": 1.036171124303813, + "grad_norm": 0.12222232669591904, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 1.0367831568639452, + "grad_norm": 0.20246204733848572, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0055, + "step": 16940 + }, + { + "epoch": 1.0373951894240774, + "grad_norm": 0.36903291940689087, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0051, + "step": 16950 + }, + { + "epoch": 1.0380072219842096, + "grad_norm": 0.3166116178035736, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0045, + "step": 16960 + }, + { + "epoch": 1.0386192545443418, + "grad_norm": 0.2777375280857086, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0041, + "step": 16970 + }, + { + "epoch": 1.039231287104474, + "grad_norm": 0.3173989951610565, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0053, + "step": 16980 + }, + { + "epoch": 1.0398433196646062, + "grad_norm": 0.2135571539402008, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0051, + "step": 16990 + }, + { + "epoch": 1.0404553522247384, + "grad_norm": 0.18536782264709473, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0037, + "step": 17000 + }, + { + "epoch": 1.0410673847848706, + "grad_norm": 0.17782410979270935, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0052, + "step": 17010 + }, + { + "epoch": 1.0416794173450028, + "grad_norm": 0.31509512662887573, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0099, + "step": 17020 + }, + { + "epoch": 1.042291449905135, + "grad_norm": 0.22748225927352905, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 1.0429034824652672, + "grad_norm": 0.14924705028533936, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0053, + "step": 17040 + }, + { + "epoch": 1.0435155150253994, + "grad_norm": 0.21390999853610992, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0044, + "step": 17050 + }, + { + "epoch": 1.0441275475855316, + "grad_norm": 0.25828516483306885, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0042, + "step": 17060 + }, + { + "epoch": 1.0447395801456638, + "grad_norm": 0.24069662392139435, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0069, + "step": 17070 + }, + { + "epoch": 1.045351612705796, + "grad_norm": 0.1090504601597786, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0036, + "step": 17080 + }, + { + "epoch": 1.0459636452659282, + "grad_norm": 0.17990687489509583, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0049, + "step": 17090 + }, + { + "epoch": 1.0465756778260604, + "grad_norm": 0.21505555510520935, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0051, + "step": 17100 + }, + { + "epoch": 1.0471877103861926, + "grad_norm": 0.2157493680715561, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0063, + "step": 17110 + }, + { + "epoch": 1.0477997429463248, + "grad_norm": 0.30865493416786194, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0053, + "step": 17120 + }, + { + "epoch": 1.048411775506457, + "grad_norm": 0.16882938146591187, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0043, + "step": 17130 + }, + { + "epoch": 1.0490238080665892, + "grad_norm": 0.14921846985816956, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0496358406267214, + "grad_norm": 0.15723800659179688, + "learning_rate": 9.400800085133245e-06, + "loss": 0.005, + "step": 17150 + }, + { + "epoch": 1.0502478731868536, + "grad_norm": 0.19597285985946655, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0046, + "step": 17160 + }, + { + "epoch": 1.0508599057469858, + "grad_norm": 0.1684723198413849, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0053, + "step": 17170 + }, + { + "epoch": 1.051471938307118, + "grad_norm": 0.1733175367116928, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0053, + "step": 17180 + }, + { + "epoch": 1.0520839708672503, + "grad_norm": 0.23111647367477417, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0048, + "step": 17190 + }, + { + "epoch": 1.0526960034273822, + "grad_norm": 0.36174628138542175, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0049, + "step": 17200 + }, + { + "epoch": 1.0533080359875144, + "grad_norm": 0.15791575610637665, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0048, + "step": 17210 + }, + { + "epoch": 1.0539200685476466, + "grad_norm": 0.16026809811592102, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0047, + "step": 17220 + }, + { + "epoch": 1.0545321011077788, + "grad_norm": 0.13964296877384186, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0033, + "step": 17230 + }, + { + "epoch": 1.055144133667911, + "grad_norm": 0.22623896598815918, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0041, + "step": 17240 + }, + { + "epoch": 1.0557561662280432, + "grad_norm": 0.15534555912017822, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 1.0563681987881754, + "grad_norm": 0.09519665688276291, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0035, + "step": 17260 + }, + { + "epoch": 1.0569802313483077, + "grad_norm": 0.19323785603046417, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0045, + "step": 17270 + }, + { + "epoch": 1.0575922639084399, + "grad_norm": 0.21194952726364136, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0047, + "step": 17280 + }, + { + "epoch": 1.058204296468572, + "grad_norm": 0.28977999091148376, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0049, + "step": 17290 + }, + { + "epoch": 1.0588163290287043, + "grad_norm": 0.1739121824502945, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0062, + "step": 17300 + }, + { + "epoch": 1.0594283615888365, + "grad_norm": 0.23189865052700043, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0055, + "step": 17310 + }, + { + "epoch": 1.0600403941489687, + "grad_norm": 0.15705449879169464, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0033, + "step": 17320 + }, + { + "epoch": 1.0606524267091009, + "grad_norm": 0.23189882934093475, + "learning_rate": 9.228411903689187e-06, + "loss": 0.003, + "step": 17330 + }, + { + "epoch": 1.061264459269233, + "grad_norm": 0.19559095799922943, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0051, + "step": 17340 + }, + { + "epoch": 1.0618764918293653, + "grad_norm": 0.2560543715953827, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0624885243894975, + "grad_norm": 0.35167232155799866, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0042, + "step": 17360 + }, + { + "epoch": 1.0631005569496297, + "grad_norm": 0.17626497149467468, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0039, + "step": 17370 + }, + { + "epoch": 1.0637125895097619, + "grad_norm": 0.18818546831607819, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0043, + "step": 17380 + }, + { + "epoch": 1.064324622069894, + "grad_norm": 0.10237561911344528, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0053, + "step": 17390 + }, + { + "epoch": 1.0649366546300263, + "grad_norm": 0.21828459203243256, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0042, + "step": 17400 + }, + { + "epoch": 1.0655486871901585, + "grad_norm": 0.09354235231876373, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0034, + "step": 17410 + }, + { + "epoch": 1.0661607197502907, + "grad_norm": 0.18106088042259216, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0051, + "step": 17420 + }, + { + "epoch": 1.066772752310423, + "grad_norm": 0.21538101136684418, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0056, + "step": 17430 + }, + { + "epoch": 1.067384784870555, + "grad_norm": 0.18729519844055176, + "learning_rate": 9.1233909973763e-06, + "loss": 0.004, + "step": 17440 + }, + { + "epoch": 1.0679968174306873, + "grad_norm": 0.3791484832763672, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0052, + "step": 17450 + }, + { + "epoch": 1.0686088499908195, + "grad_norm": 0.19206254184246063, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0042, + "step": 17460 + }, + { + "epoch": 1.0692208825509517, + "grad_norm": 0.15434518456459045, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0061, + "step": 17470 + }, + { + "epoch": 1.069832915111084, + "grad_norm": 0.17898093163967133, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0045, + "step": 17480 + }, + { + "epoch": 1.0704449476712161, + "grad_norm": 0.21975649893283844, + "learning_rate": 9.07574141798717e-06, + "loss": 0.005, + "step": 17490 + }, + { + "epoch": 1.0710569802313483, + "grad_norm": 0.1380346417427063, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0032, + "step": 17500 + }, + { + "epoch": 1.0716690127914805, + "grad_norm": 0.28567400574684143, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0044, + "step": 17510 + }, + { + "epoch": 1.0722810453516127, + "grad_norm": 0.22925534844398499, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0041, + "step": 17520 + }, + { + "epoch": 1.072893077911745, + "grad_norm": 0.27094215154647827, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0047, + "step": 17530 + }, + { + "epoch": 1.0735051104718771, + "grad_norm": 0.32299691438674927, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0048, + "step": 17540 + }, + { + "epoch": 1.0741171430320093, + "grad_norm": 0.26789531111717224, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0047, + "step": 17550 + }, + { + "epoch": 1.0747291755921415, + "grad_norm": 0.3175952434539795, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0077, + "step": 17560 + }, + { + "epoch": 1.0753412081522737, + "grad_norm": 0.24784249067306519, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0048, + "step": 17570 + }, + { + "epoch": 1.075953240712406, + "grad_norm": 0.3081960380077362, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0046, + "step": 17580 + }, + { + "epoch": 1.0765652732725381, + "grad_norm": 0.25334152579307556, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0039, + "step": 17590 + }, + { + "epoch": 1.0771773058326704, + "grad_norm": 0.24747619032859802, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0777893383928026, + "grad_norm": 0.19048908352851868, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0049, + "step": 17610 + }, + { + "epoch": 1.0784013709529348, + "grad_norm": 0.18883349001407623, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "epoch": 1.079013403513067, + "grad_norm": 0.18653099238872528, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0044, + "step": 17630 + }, + { + "epoch": 1.0796254360731992, + "grad_norm": 0.1320251226425171, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 1.0802374686333314, + "grad_norm": 0.14996238052845, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0041, + "step": 17650 + }, + { + "epoch": 1.0808495011934636, + "grad_norm": 0.4576573073863983, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0059, + "step": 17660 + }, + { + "epoch": 1.0814615337535958, + "grad_norm": 0.19582511484622955, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0051, + "step": 17670 + }, + { + "epoch": 1.082073566313728, + "grad_norm": 0.21973003447055817, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0059, + "step": 17680 + }, + { + "epoch": 1.0826855988738602, + "grad_norm": 0.18183568120002747, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0056, + "step": 17690 + }, + { + "epoch": 1.0832976314339924, + "grad_norm": 0.1761978417634964, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0049, + "step": 17700 + }, + { + "epoch": 1.0839096639941246, + "grad_norm": 0.10185366123914719, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0041, + "step": 17710 + }, + { + "epoch": 1.0845216965542568, + "grad_norm": 0.262513130903244, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0046, + "step": 17720 + }, + { + "epoch": 1.0851337291143888, + "grad_norm": 0.36413198709487915, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0043, + "step": 17730 + }, + { + "epoch": 1.085745761674521, + "grad_norm": 0.2258218675851822, + "learning_rate": 8.83836825410936e-06, + "loss": 0.005, + "step": 17740 + }, + { + "epoch": 1.0863577942346532, + "grad_norm": 0.20840497314929962, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0067, + "step": 17750 + }, + { + "epoch": 1.0869698267947854, + "grad_norm": 0.33392995595932007, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.0875818593549176, + "grad_norm": 0.18477876484394073, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0039, + "step": 17770 + }, + { + "epoch": 1.0881938919150498, + "grad_norm": 0.14785899221897125, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0063, + "step": 17780 + }, + { + "epoch": 1.088805924475182, + "grad_norm": 0.12930043041706085, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0055, + "step": 17790 + }, + { + "epoch": 1.0894179570353142, + "grad_norm": 0.1541786789894104, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0035, + "step": 17800 + }, + { + "epoch": 1.0900299895954464, + "grad_norm": 0.1781499683856964, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0054, + "step": 17810 + }, + { + "epoch": 1.0906420221555786, + "grad_norm": 0.13659314811229706, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0912540547157108, + "grad_norm": 0.18936918675899506, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0041, + "step": 17830 + }, + { + "epoch": 1.091866087275843, + "grad_norm": 0.24795638024806976, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0043, + "step": 17840 + }, + { + "epoch": 1.0924781198359752, + "grad_norm": 0.28090324997901917, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0036, + "step": 17850 + }, + { + "epoch": 1.0930901523961074, + "grad_norm": 0.3130576014518738, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0042, + "step": 17860 + }, + { + "epoch": 1.0937021849562396, + "grad_norm": 0.19758646190166473, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0047, + "step": 17870 + }, + { + "epoch": 1.0943142175163718, + "grad_norm": 0.20309071242809296, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0042, + "step": 17880 + }, + { + "epoch": 1.094926250076504, + "grad_norm": 0.19741898775100708, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0057, + "step": 17890 + }, + { + "epoch": 1.0955382826366362, + "grad_norm": 0.19182747602462769, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0042, + "step": 17900 + }, + { + "epoch": 1.0961503151967684, + "grad_norm": 0.14508575201034546, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.0967623477569006, + "grad_norm": 0.19854849576950073, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0064, + "step": 17920 + }, + { + "epoch": 1.0973743803170328, + "grad_norm": 0.15055720508098602, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0049, + "step": 17930 + }, + { + "epoch": 1.097986412877165, + "grad_norm": 0.1855372190475464, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0043, + "step": 17940 + }, + { + "epoch": 1.0985984454372972, + "grad_norm": 0.13770940899848938, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0058, + "step": 17950 + }, + { + "epoch": 1.0992104779974294, + "grad_norm": 0.24905221164226532, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0048, + "step": 17960 + }, + { + "epoch": 1.0998225105575616, + "grad_norm": 0.1951165348291397, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0043, + "step": 17970 + }, + { + "epoch": 1.1004345431176938, + "grad_norm": 0.18365852534770966, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 1.101046575677826, + "grad_norm": 0.16304127871990204, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0034, + "step": 17990 + }, + { + "epoch": 1.1016586082379582, + "grad_norm": 0.262677401304245, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 1.1022706407980905, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.583791146965244e-06, + "loss": 0.007, + "step": 18010 + }, + { + "epoch": 1.1028826733582227, + "grad_norm": 0.2832951247692108, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0056, + "step": 18020 + }, + { + "epoch": 1.1034947059183549, + "grad_norm": 0.1781810224056244, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 1.104106738478487, + "grad_norm": 0.23228950798511505, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0045, + "step": 18040 + }, + { + "epoch": 1.1047187710386193, + "grad_norm": 0.2573170065879822, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 1.1053308035987515, + "grad_norm": 0.30996036529541016, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0054, + "step": 18060 + }, + { + "epoch": 1.1059428361588837, + "grad_norm": 0.24979132413864136, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0045, + "step": 18070 + }, + { + "epoch": 1.1065548687190159, + "grad_norm": 0.17564314603805542, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0033, + "step": 18080 + }, + { + "epoch": 1.107166901279148, + "grad_norm": 0.14539776742458344, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0047, + "step": 18090 + }, + { + "epoch": 1.1077789338392803, + "grad_norm": 0.2530387341976166, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0058, + "step": 18100 + }, + { + "epoch": 1.1083909663994125, + "grad_norm": 0.2038760781288147, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0052, + "step": 18110 + }, + { + "epoch": 1.1090029989595447, + "grad_norm": 0.1769075244665146, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0043, + "step": 18120 + }, + { + "epoch": 1.1096150315196769, + "grad_norm": 0.1686626374721527, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0055, + "step": 18130 + }, + { + "epoch": 1.110227064079809, + "grad_norm": 0.21752336621284485, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0052, + "step": 18140 + }, + { + "epoch": 1.1108390966399413, + "grad_norm": 0.2739295959472656, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0039, + "step": 18150 + }, + { + "epoch": 1.1114511292000735, + "grad_norm": 0.18259567022323608, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0038, + "step": 18160 + }, + { + "epoch": 1.1120631617602057, + "grad_norm": 0.21565310657024384, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0043, + "step": 18170 + }, + { + "epoch": 1.112675194320338, + "grad_norm": 0.2141607403755188, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0059, + "step": 18180 + }, + { + "epoch": 1.11328722688047, + "grad_norm": 0.3017563819885254, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0044, + "step": 18190 + }, + { + "epoch": 1.1138992594406023, + "grad_norm": 0.2021455019712448, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0044, + "step": 18200 + }, + { + "epoch": 1.1145112920007345, + "grad_norm": 0.2113070785999298, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0048, + "step": 18210 + }, + { + "epoch": 1.1151233245608667, + "grad_norm": 0.18945784866809845, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0029, + "step": 18220 + }, + { + "epoch": 1.115735357120999, + "grad_norm": 0.15259192883968353, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0043, + "step": 18230 + }, + { + "epoch": 1.1163473896811311, + "grad_norm": 0.17555822432041168, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0039, + "step": 18240 + }, + { + "epoch": 1.1169594222412633, + "grad_norm": 0.20105648040771484, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0058, + "step": 18250 + }, + { + "epoch": 1.1175714548013955, + "grad_norm": 0.31626567244529724, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0044, + "step": 18260 + }, + { + "epoch": 1.1181834873615277, + "grad_norm": 0.16219007968902588, + "learning_rate": 8.340593854157868e-06, + "loss": 0.005, + "step": 18270 + }, + { + "epoch": 1.11879551992166, + "grad_norm": 0.2174186110496521, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0055, + "step": 18280 + }, + { + "epoch": 1.1194075524817921, + "grad_norm": 0.13639339804649353, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0043, + "step": 18290 + }, + { + "epoch": 1.1200195850419243, + "grad_norm": 0.15100249648094177, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0042, + "step": 18300 + }, + { + "epoch": 1.1206316176020565, + "grad_norm": 0.2114904671907425, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0043, + "step": 18310 + }, + { + "epoch": 1.1212436501621887, + "grad_norm": 0.2941966950893402, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0052, + "step": 18320 + }, + { + "epoch": 1.1218556827223207, + "grad_norm": 0.21695150434970856, + "learning_rate": 8.28476400245882e-06, + "loss": 0.005, + "step": 18330 + }, + { + "epoch": 1.122467715282453, + "grad_norm": 0.11768218129873276, + "learning_rate": 8.275470116190976e-06, + "loss": 0.005, + "step": 18340 + }, + { + "epoch": 1.1230797478425851, + "grad_norm": 0.1427483856678009, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0039, + "step": 18350 + }, + { + "epoch": 1.1236917804027173, + "grad_norm": 0.1837971955537796, + "learning_rate": 8.256891946721157e-06, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 1.1243038129628495, + "grad_norm": 0.30968883633613586, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0037, + "step": 18370 + }, + { + "epoch": 1.1249158455229817, + "grad_norm": 0.13366396725177765, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0042, + "step": 18380 + }, + { + "epoch": 1.125527878083114, + "grad_norm": 0.1829235553741455, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0039, + "step": 18390 + }, + { + "epoch": 1.1261399106432461, + "grad_norm": 0.3106991648674011, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0052, + "step": 18400 + }, + { + "epoch": 1.1267519432033783, + "grad_norm": 0.38655754923820496, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0046, + "step": 18410 + }, + { + "epoch": 1.1273639757635106, + "grad_norm": 0.23598383367061615, + "learning_rate": 8.201235047388747e-06, + "loss": 0.004, + "step": 18420 + }, + { + "epoch": 1.1279760083236428, + "grad_norm": 0.17428012192249298, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0046, + "step": 18430 + }, + { + "epoch": 1.128588040883775, + "grad_norm": 0.1847466081380844, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0043, + "step": 18440 + }, + { + "epoch": 1.1292000734439072, + "grad_norm": 0.14917762577533722, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0049, + "step": 18450 + }, + { + "epoch": 1.1298121060040394, + "grad_norm": 0.2882528305053711, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0046, + "step": 18460 + }, + { + "epoch": 1.1304241385641716, + "grad_norm": 0.36186549067497253, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0065, + "step": 18470 + }, + { + "epoch": 1.1310361711243038, + "grad_norm": 0.1604463905096054, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0037, + "step": 18480 + }, + { + "epoch": 1.131648203684436, + "grad_norm": 0.17751921713352203, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0034, + "step": 18490 + }, + { + "epoch": 1.1322602362445682, + "grad_norm": 0.15355733036994934, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0059, + "step": 18500 + }, + { + "epoch": 1.1328722688047004, + "grad_norm": 0.21558596193790436, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0044, + "step": 18510 + }, + { + "epoch": 1.1334843013648326, + "grad_norm": 0.20114412903785706, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1340963339249648, + "grad_norm": 0.17260855436325073, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0053, + "step": 18530 + }, + { + "epoch": 1.134708366485097, + "grad_norm": 0.16089287400245667, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0032, + "step": 18540 + }, + { + "epoch": 1.1353203990452292, + "grad_norm": 0.14655937254428864, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 1.1359324316053614, + "grad_norm": 0.16373249888420105, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0058, + "step": 18560 + }, + { + "epoch": 1.1365444641654936, + "grad_norm": 0.14543801546096802, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1371564967256258, + "grad_norm": 0.3515278100967407, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0043, + "step": 18580 + }, + { + "epoch": 1.137768529285758, + "grad_norm": 0.21776945888996124, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0053, + "step": 18590 + }, + { + "epoch": 1.1383805618458902, + "grad_norm": 0.21879829466342926, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0051, + "step": 18600 + }, + { + "epoch": 1.1389925944060224, + "grad_norm": 0.16967973113059998, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0048, + "step": 18610 + }, + { + "epoch": 1.1396046269661546, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0056, + "step": 18620 + }, + { + "epoch": 1.1402166595262868, + "grad_norm": 0.1858961284160614, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0067, + "step": 18630 + }, + { + "epoch": 1.140828692086419, + "grad_norm": 0.25853803753852844, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0057, + "step": 18640 + }, + { + "epoch": 1.1414407246465512, + "grad_norm": 0.18566234409809113, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0048, + "step": 18650 + }, + { + "epoch": 1.1420527572066834, + "grad_norm": 0.3471083343029022, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0042, + "step": 18660 + }, + { + "epoch": 1.1426647897668156, + "grad_norm": 0.2092636376619339, + "learning_rate": 7.970630670012853e-06, + "loss": 0.004, + "step": 18670 + }, + { + "epoch": 1.1432768223269478, + "grad_norm": 0.3432580828666687, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0044, + "step": 18680 + }, + { + "epoch": 1.14388885488708, + "grad_norm": 0.14227882027626038, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0041, + "step": 18690 + }, + { + "epoch": 1.1445008874472122, + "grad_norm": 0.2128007709980011, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0037, + "step": 18700 + }, + { + "epoch": 1.1451129200073444, + "grad_norm": 0.25377482175827026, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0049, + "step": 18710 + }, + { + "epoch": 1.1457249525674766, + "grad_norm": 0.1905982494354248, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0037, + "step": 18720 + }, + { + "epoch": 1.1463369851276088, + "grad_norm": 0.3090096712112427, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0079, + "step": 18730 + }, + { + "epoch": 1.146949017687741, + "grad_norm": 0.15604345500469208, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0037, + "step": 18740 + }, + { + "epoch": 1.1475610502478732, + "grad_norm": 0.21756386756896973, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0043, + "step": 18750 + }, + { + "epoch": 1.1481730828080055, + "grad_norm": 0.23869304358959198, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0038, + "step": 18760 + }, + { + "epoch": 1.1487851153681377, + "grad_norm": 0.18082380294799805, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0073, + "step": 18770 + }, + { + "epoch": 1.1493971479282699, + "grad_norm": 0.4032754898071289, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0061, + "step": 18780 + }, + { + "epoch": 1.150009180488402, + "grad_norm": 0.3173290491104126, + "learning_rate": 7.860719408056385e-06, + "loss": 0.004, + "step": 18790 + }, + { + "epoch": 1.1506212130485343, + "grad_norm": 0.18892645835876465, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.1512332456086665, + "grad_norm": 0.26740241050720215, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0056, + "step": 18810 + }, + { + "epoch": 1.1518452781687987, + "grad_norm": 0.3046218752861023, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0072, + "step": 18820 + }, + { + "epoch": 1.1524573107289309, + "grad_norm": 0.17181983590126038, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0034, + "step": 18830 + }, + { + "epoch": 1.1530693432890629, + "grad_norm": 0.22095724940299988, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0045, + "step": 18840 + }, + { + "epoch": 1.153681375849195, + "grad_norm": 0.1514609307050705, + "learning_rate": 7.80596155940873e-06, + "loss": 0.004, + "step": 18850 + }, + { + "epoch": 1.1542934084093273, + "grad_norm": 0.15244366228580475, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0047, + "step": 18860 + }, + { + "epoch": 1.1549054409694595, + "grad_norm": 0.24359947443008423, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0039, + "step": 18870 + }, + { + "epoch": 1.1555174735295917, + "grad_norm": 0.15558156371116638, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0036, + "step": 18880 + }, + { + "epoch": 1.1561295060897239, + "grad_norm": 0.33679234981536865, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0041, + "step": 18890 + }, + { + "epoch": 1.156741538649856, + "grad_norm": 0.15811999142169952, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0062, + "step": 18900 + }, + { + "epoch": 1.1573535712099883, + "grad_norm": 0.14838527143001556, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0029, + "step": 18910 + }, + { + "epoch": 1.1579656037701205, + "grad_norm": 0.23024815320968628, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0038, + "step": 18920 + }, + { + "epoch": 1.1585776363302527, + "grad_norm": 0.18455618619918823, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0044, + "step": 18930 + }, + { + "epoch": 1.1591896688903849, + "grad_norm": 0.20213079452514648, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.159801701450517, + "grad_norm": 0.19000643491744995, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0043, + "step": 18950 + }, + { + "epoch": 1.1604137340106493, + "grad_norm": 0.14075686037540436, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0047, + "step": 18960 + }, + { + "epoch": 1.1610257665707815, + "grad_norm": 0.22101792693138123, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0045, + "step": 18970 + }, + { + "epoch": 1.1616377991309137, + "grad_norm": 0.1097906231880188, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0037, + "step": 18980 + }, + { + "epoch": 1.162249831691046, + "grad_norm": 0.16169370710849762, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.162861864251178, + "grad_norm": 0.32931753993034363, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0052, + "step": 19000 + }, + { + "epoch": 1.1634738968113103, + "grad_norm": 0.2494741678237915, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0057, + "step": 19010 + }, + { + "epoch": 1.1640859293714425, + "grad_norm": 0.18492171168327332, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0056, + "step": 19020 + }, + { + "epoch": 1.1646979619315747, + "grad_norm": 0.18830963969230652, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0036, + "step": 19030 + }, + { + "epoch": 1.165309994491707, + "grad_norm": 0.1331586092710495, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0046, + "step": 19040 + }, + { + "epoch": 1.1659220270518391, + "grad_norm": 0.2433806210756302, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0053, + "step": 19050 + }, + { + "epoch": 1.1665340596119713, + "grad_norm": 0.24491485953330994, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0037, + "step": 19060 + }, + { + "epoch": 1.1671460921721035, + "grad_norm": 0.1789211630821228, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 1.1677581247322357, + "grad_norm": 0.2729121148586273, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0043, + "step": 19080 + }, + { + "epoch": 1.168370157292368, + "grad_norm": 0.19535189867019653, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0056, + "step": 19090 + }, + { + "epoch": 1.1689821898525001, + "grad_norm": 0.2282983660697937, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0048, + "step": 19100 + }, + { + "epoch": 1.1695942224126323, + "grad_norm": 0.1281195729970932, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0045, + "step": 19110 + }, + { + "epoch": 1.1702062549727645, + "grad_norm": 0.2850968539714813, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0034, + "step": 19120 + }, + { + "epoch": 1.1708182875328967, + "grad_norm": 0.12891536951065063, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.171430320093029, + "grad_norm": 0.13464727997779846, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0033, + "step": 19140 + }, + { + "epoch": 1.1720423526531611, + "grad_norm": 0.2415568083524704, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0041, + "step": 19150 + }, + { + "epoch": 1.1726543852132933, + "grad_norm": 0.15686331689357758, + "learning_rate": 7.525246655150879e-06, + "loss": 0.004, + "step": 19160 + }, + { + "epoch": 1.1732664177734256, + "grad_norm": 0.15490666031837463, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0039, + "step": 19170 + }, + { + "epoch": 1.1738784503335578, + "grad_norm": 0.14095450937747955, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0034, + "step": 19180 + }, + { + "epoch": 1.17449048289369, + "grad_norm": 0.19024531543254852, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0084, + "step": 19190 + }, + { + "epoch": 1.1751025154538222, + "grad_norm": 0.2583692669868469, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 1.1757145480139544, + "grad_norm": 0.19117654860019684, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0038, + "step": 19210 + }, + { + "epoch": 1.1763265805740866, + "grad_norm": 0.15838374197483063, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1769386131342188, + "grad_norm": 0.30352044105529785, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0052, + "step": 19230 + }, + { + "epoch": 1.177550645694351, + "grad_norm": 0.229969322681427, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0043, + "step": 19240 + }, + { + "epoch": 1.1781626782544832, + "grad_norm": 0.17781461775302887, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0046, + "step": 19250 + }, + { + "epoch": 1.1787747108146154, + "grad_norm": 0.1306339055299759, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1793867433747476, + "grad_norm": 0.15727253258228302, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0045, + "step": 19270 + }, + { + "epoch": 1.1799987759348798, + "grad_norm": 0.24909166991710663, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0045, + "step": 19280 + }, + { + "epoch": 1.180610808495012, + "grad_norm": 0.4604126811027527, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0053, + "step": 19290 + }, + { + "epoch": 1.1812228410551442, + "grad_norm": 0.12739762663841248, + "learning_rate": 7.399737764864619e-06, + "loss": 0.004, + "step": 19300 + }, + { + "epoch": 1.1818348736152764, + "grad_norm": 0.2849223017692566, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0043, + "step": 19310 + }, + { + "epoch": 1.1824469061754086, + "grad_norm": 0.26089897751808167, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0044, + "step": 19320 + }, + { + "epoch": 1.1830589387355408, + "grad_norm": 0.1752242147922516, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.183670971295673, + "grad_norm": 0.14917130768299103, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0097, + "step": 19340 + }, + { + "epoch": 1.1842830038558052, + "grad_norm": 0.1599114090204239, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0061, + "step": 19350 + }, + { + "epoch": 1.1848950364159374, + "grad_norm": 0.16370004415512085, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0035, + "step": 19360 + }, + { + "epoch": 1.1855070689760696, + "grad_norm": 0.19354844093322754, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0032, + "step": 19370 + }, + { + "epoch": 1.1861191015362018, + "grad_norm": 0.19689561426639557, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0067, + "step": 19380 + }, + { + "epoch": 1.186731134096334, + "grad_norm": 0.22203278541564941, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0041, + "step": 19390 + }, + { + "epoch": 1.1873431666564662, + "grad_norm": 0.13579773902893066, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0048, + "step": 19400 + }, + { + "epoch": 1.1879551992165984, + "grad_norm": 0.12321218848228455, + "learning_rate": 7.301703138094429e-06, + "loss": 0.004, + "step": 19410 + }, + { + "epoch": 1.1885672317767306, + "grad_norm": 0.28819525241851807, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0042, + "step": 19420 + }, + { + "epoch": 1.1891792643368628, + "grad_norm": 0.2577916085720062, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0039, + "step": 19430 + }, + { + "epoch": 1.189791296896995, + "grad_norm": 0.26840633153915405, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0062, + "step": 19440 + }, + { + "epoch": 1.1904033294571272, + "grad_norm": 0.24222144484519958, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1910153620172594, + "grad_norm": 0.157009556889534, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0038, + "step": 19460 + }, + { + "epoch": 1.1916273945773916, + "grad_norm": 0.19925500452518463, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0042, + "step": 19470 + }, + { + "epoch": 1.1922394271375236, + "grad_norm": 0.19200846552848816, + "learning_rate": 7.239590017751423e-06, + "loss": 0.004, + "step": 19480 + }, + { + "epoch": 1.1928514596976558, + "grad_norm": 0.18441490828990936, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0056, + "step": 19490 + }, + { + "epoch": 1.193463492257788, + "grad_norm": 0.27565324306488037, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0046, + "step": 19500 + }, + { + "epoch": 1.1940755248179202, + "grad_norm": 0.17830556631088257, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0043, + "step": 19510 + }, + { + "epoch": 1.1946875573780524, + "grad_norm": 0.2769330143928528, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0037, + "step": 19520 + }, + { + "epoch": 1.1952995899381846, + "grad_norm": 0.168451189994812, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0039, + "step": 19530 + }, + { + "epoch": 1.1959116224983168, + "grad_norm": 0.31246763467788696, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0046, + "step": 19540 + }, + { + "epoch": 1.196523655058449, + "grad_norm": 0.21112671494483948, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0041, + "step": 19550 + }, + { + "epoch": 1.1971356876185812, + "grad_norm": 0.31681302189826965, + "learning_rate": 7.168868583990693e-06, + "loss": 0.005, + "step": 19560 + }, + { + "epoch": 1.1977477201787134, + "grad_norm": 0.18634411692619324, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0042, + "step": 19570 + }, + { + "epoch": 1.1983597527388457, + "grad_norm": 0.17780153453350067, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0057, + "step": 19580 + }, + { + "epoch": 1.1989717852989779, + "grad_norm": 0.19183002412319183, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0043, + "step": 19590 + }, + { + "epoch": 1.19958381785911, + "grad_norm": 0.28469574451446533, + "learning_rate": 7.133615440411572e-06, + "loss": 0.004, + "step": 19600 + }, + { + "epoch": 1.2001958504192423, + "grad_norm": 0.22470368444919586, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0044, + "step": 19610 + }, + { + "epoch": 1.2008078829793745, + "grad_norm": 0.23563240468502045, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0041, + "step": 19620 + }, + { + "epoch": 1.2014199155395067, + "grad_norm": 0.18467430770397186, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0053, + "step": 19630 + }, + { + "epoch": 1.2020319480996389, + "grad_norm": 0.12539178133010864, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0047, + "step": 19640 + }, + { + "epoch": 1.202643980659771, + "grad_norm": 0.2552005648612976, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.004, + "step": 19650 + }, + { + "epoch": 1.2032560132199033, + "grad_norm": 0.13963459432125092, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0035, + "step": 19660 + }, + { + "epoch": 1.2038680457800355, + "grad_norm": 0.17387327551841736, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0038, + "step": 19670 + }, + { + "epoch": 1.2044800783401677, + "grad_norm": 0.1284111589193344, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0044, + "step": 19680 + }, + { + "epoch": 1.2050921109002999, + "grad_norm": 0.22337380051612854, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 1.205704143460432, + "grad_norm": 0.2254808247089386, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0033, + "step": 19700 + }, + { + "epoch": 1.2063161760205643, + "grad_norm": 0.19316980242729187, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0041, + "step": 19710 + }, + { + "epoch": 1.2069282085806965, + "grad_norm": 0.17951075732707977, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0038, + "step": 19720 + }, + { + "epoch": 1.2075402411408287, + "grad_norm": 0.3105165660381317, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0043, + "step": 19730 + }, + { + "epoch": 1.208152273700961, + "grad_norm": 0.21083533763885498, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0039, + "step": 19740 + }, + { + "epoch": 1.208764306261093, + "grad_norm": 0.20121195912361145, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0035, + "step": 19750 + }, + { + "epoch": 1.2093763388212253, + "grad_norm": 0.20067447423934937, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0047, + "step": 19760 + }, + { + "epoch": 1.2099883713813575, + "grad_norm": 0.15943066775798798, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0039, + "step": 19770 + }, + { + "epoch": 1.2106004039414897, + "grad_norm": 0.21581032872200012, + "learning_rate": 6.975884226362e-06, + "loss": 0.0045, + "step": 19780 + }, + { + "epoch": 1.211212436501622, + "grad_norm": 0.16258753836154938, + "learning_rate": 6.967165692827958e-06, + "loss": 0.004, + "step": 19790 + }, + { + "epoch": 1.2118244690617541, + "grad_norm": 0.18742400407791138, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0047, + "step": 19800 + }, + { + "epoch": 1.2124365016218863, + "grad_norm": 0.09035168588161469, + "learning_rate": 6.949742834253074e-06, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 1.2130485341820185, + "grad_norm": 0.21749694645404816, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0054, + "step": 19820 + }, + { + "epoch": 1.2136605667421507, + "grad_norm": 0.3189448416233063, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0043, + "step": 19830 + }, + { + "epoch": 1.214272599302283, + "grad_norm": 0.26815512776374817, + "learning_rate": 6.923644220932124e-06, + "loss": 0.005, + "step": 19840 + }, + { + "epoch": 1.2148846318624151, + "grad_norm": 0.19533704221248627, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0037, + "step": 19850 + }, + { + "epoch": 1.2154966644225473, + "grad_norm": 0.36249589920043945, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0064, + "step": 19860 + }, + { + "epoch": 1.2161086969826795, + "grad_norm": 0.19801265001296997, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0042, + "step": 19870 + }, + { + "epoch": 1.2167207295428117, + "grad_norm": 0.10341386497020721, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0053, + "step": 19880 + }, + { + "epoch": 1.217332762102944, + "grad_norm": 0.17985381186008453, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.2179447946630761, + "grad_norm": 0.18160982429981232, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0061, + "step": 19900 + }, + { + "epoch": 1.2185568272232083, + "grad_norm": 0.15552182495594025, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0047, + "step": 19910 + }, + { + "epoch": 1.2191688597833406, + "grad_norm": 0.34908807277679443, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0046, + "step": 19920 + }, + { + "epoch": 1.2197808923434728, + "grad_norm": 0.14835652709007263, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0042, + "step": 19930 + }, + { + "epoch": 1.220392924903605, + "grad_norm": 0.23276430368423462, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0042, + "step": 19940 + }, + { + "epoch": 1.2210049574637372, + "grad_norm": 0.1900823563337326, + "learning_rate": 6.828319751504063e-06, + "loss": 0.004, + "step": 19950 + }, + { + "epoch": 1.2216169900238694, + "grad_norm": 0.134046271443367, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0039, + "step": 19960 + }, + { + "epoch": 1.2222290225840013, + "grad_norm": 0.17264600098133087, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 1.2228410551441335, + "grad_norm": 0.24845834076404572, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0044, + "step": 19980 + }, + { + "epoch": 1.2234530877042658, + "grad_norm": 0.14805762469768524, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0049, + "step": 19990 + }, + { + "epoch": 1.224065120264398, + "grad_norm": 0.228907972574234, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0043, + "step": 20000 + }, + { + "epoch": 1.2246771528245302, + "grad_norm": 0.16869507730007172, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0041, + "step": 20010 + }, + { + "epoch": 1.2252891853846624, + "grad_norm": 0.1983603835105896, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0041, + "step": 20020 + }, + { + "epoch": 1.2259012179447946, + "grad_norm": 0.17656362056732178, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0028, + "step": 20030 + }, + { + "epoch": 1.2265132505049268, + "grad_norm": 0.1360313892364502, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0069, + "step": 20040 + }, + { + "epoch": 1.227125283065059, + "grad_norm": 0.21057721972465515, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2277373156251912, + "grad_norm": 0.138632670044899, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0038, + "step": 20060 + }, + { + "epoch": 1.2283493481853234, + "grad_norm": 0.17815573513507843, + "learning_rate": 6.725005485342219e-06, + "loss": 0.003, + "step": 20070 + }, + { + "epoch": 1.2289613807454556, + "grad_norm": 0.1769353598356247, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0066, + "step": 20080 + }, + { + "epoch": 1.2295734133055878, + "grad_norm": 0.23068928718566895, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0048, + "step": 20090 + }, + { + "epoch": 1.23018544586572, + "grad_norm": 0.25139328837394714, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0049, + "step": 20100 + }, + { + "epoch": 1.2307974784258522, + "grad_norm": 0.09128634631633759, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0042, + "step": 20110 + }, + { + "epoch": 1.2314095109859844, + "grad_norm": 0.20516613125801086, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0031, + "step": 20120 + }, + { + "epoch": 1.2320215435461166, + "grad_norm": 0.1518358588218689, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0049, + "step": 20130 + }, + { + "epoch": 1.2326335761062488, + "grad_norm": 0.1673758625984192, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0044, + "step": 20140 + }, + { + "epoch": 1.233245608666381, + "grad_norm": 0.14084585011005402, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0053, + "step": 20150 + }, + { + "epoch": 1.2338576412265132, + "grad_norm": 0.23316942155361176, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0042, + "step": 20160 + }, + { + "epoch": 1.2344696737866454, + "grad_norm": 0.23793813586235046, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0045, + "step": 20170 + }, + { + "epoch": 1.2350817063467776, + "grad_norm": 0.4269389510154724, + "learning_rate": 6.630934952049143e-06, + "loss": 0.005, + "step": 20180 + }, + { + "epoch": 1.2356937389069098, + "grad_norm": 0.15654191374778748, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0039, + "step": 20190 + }, + { + "epoch": 1.236305771467042, + "grad_norm": 0.19204623997211456, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0032, + "step": 20200 + }, + { + "epoch": 1.2369178040271742, + "grad_norm": 0.15817691385746002, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0044, + "step": 20210 + }, + { + "epoch": 1.2375298365873064, + "grad_norm": 0.12637947499752045, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2381418691474386, + "grad_norm": 0.26657921075820923, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0041, + "step": 20230 + }, + { + "epoch": 1.2387539017075708, + "grad_norm": 0.15207791328430176, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0045, + "step": 20240 + }, + { + "epoch": 1.239365934267703, + "grad_norm": 0.32583367824554443, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0042, + "step": 20250 + }, + { + "epoch": 1.2399779668278352, + "grad_norm": 0.15617726743221283, + "learning_rate": 6.562908932779455e-06, + "loss": 0.004, + "step": 20260 + }, + { + "epoch": 1.2405899993879674, + "grad_norm": 0.1935809850692749, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2412020319480996, + "grad_norm": 0.17422369122505188, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0035, + "step": 20280 + }, + { + "epoch": 1.2418140645082318, + "grad_norm": 0.15332955121994019, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0049, + "step": 20290 + }, + { + "epoch": 1.242426097068364, + "grad_norm": 0.16183018684387207, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0042, + "step": 20300 + }, + { + "epoch": 1.2430381296284962, + "grad_norm": 0.28421106934547424, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0045, + "step": 20310 + }, + { + "epoch": 1.2436501621886284, + "grad_norm": 0.23288874328136444, + "learning_rate": 6.512107839793337e-06, + "loss": 0.004, + "step": 20320 + }, + { + "epoch": 1.2442621947487607, + "grad_norm": 0.17955242097377777, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 1.2448742273088929, + "grad_norm": 0.20192117989063263, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0055, + "step": 20340 + }, + { + "epoch": 1.245486259869025, + "grad_norm": 0.15365810692310333, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 1.2460982924291573, + "grad_norm": 0.25220832228660583, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0039, + "step": 20360 + }, + { + "epoch": 1.2467103249892895, + "grad_norm": 0.25777462124824524, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0053, + "step": 20370 + }, + { + "epoch": 1.2473223575494217, + "grad_norm": 0.2693277895450592, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0053, + "step": 20380 + }, + { + "epoch": 1.2479343901095539, + "grad_norm": 0.22846420109272003, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0033, + "step": 20390 + }, + { + "epoch": 1.248546422669686, + "grad_norm": 0.17022505402565002, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0046, + "step": 20400 + }, + { + "epoch": 1.2491584552298183, + "grad_norm": 0.08295682072639465, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0035, + "step": 20410 + }, + { + "epoch": 1.2497704877899505, + "grad_norm": 0.2745625972747803, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0044, + "step": 20420 + }, + { + "epoch": 1.2503825203500827, + "grad_norm": 0.12855033576488495, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2509945529102149, + "grad_norm": 0.30358386039733887, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0049, + "step": 20440 + }, + { + "epoch": 1.251606585470347, + "grad_norm": 0.15514959394931793, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0048, + "step": 20450 + }, + { + "epoch": 1.2522186180304793, + "grad_norm": 0.1414988487958908, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0088, + "step": 20460 + }, + { + "epoch": 1.2528306505906115, + "grad_norm": 0.17399665713310242, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0046, + "step": 20470 + }, + { + "epoch": 1.2534426831507437, + "grad_norm": 0.22629426419734955, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.254054715710876, + "grad_norm": 0.30595293641090393, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0027, + "step": 20490 + }, + { + "epoch": 1.254666748271008, + "grad_norm": 0.17980262637138367, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2552787808311403, + "grad_norm": 0.19016452133655548, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0029, + "step": 20510 + }, + { + "epoch": 1.2558908133912725, + "grad_norm": 0.20200394093990326, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0037, + "step": 20520 + }, + { + "epoch": 1.2565028459514047, + "grad_norm": 0.15347513556480408, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0041, + "step": 20530 + }, + { + "epoch": 1.257114878511537, + "grad_norm": 0.1851687729358673, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0042, + "step": 20540 + }, + { + "epoch": 1.2577269110716691, + "grad_norm": 0.2529662549495697, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0037, + "step": 20550 + }, + { + "epoch": 1.2583389436318013, + "grad_norm": 0.18209592998027802, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 1.2589509761919335, + "grad_norm": 0.18981963396072388, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0036, + "step": 20570 + }, + { + "epoch": 1.2595630087520657, + "grad_norm": 0.13232728838920593, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0035, + "step": 20580 + }, + { + "epoch": 1.260175041312198, + "grad_norm": 0.133514404296875, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0039, + "step": 20590 + }, + { + "epoch": 1.2607870738723301, + "grad_norm": 0.14339123666286469, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0043, + "step": 20600 + }, + { + "epoch": 1.2613991064324623, + "grad_norm": 0.48857489228248596, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0045, + "step": 20610 + }, + { + "epoch": 1.2620111389925945, + "grad_norm": 0.1513262242078781, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0029, + "step": 20620 + }, + { + "epoch": 1.2626231715527267, + "grad_norm": 0.1497354805469513, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0039, + "step": 20630 + }, + { + "epoch": 1.2632352041128587, + "grad_norm": 0.132791206240654, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0037, + "step": 20640 + }, + { + "epoch": 1.263847236672991, + "grad_norm": 0.13804496824741364, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0035, + "step": 20650 + }, + { + "epoch": 1.2644592692331231, + "grad_norm": 0.19393391907215118, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0049, + "step": 20660 + }, + { + "epoch": 1.2650713017932553, + "grad_norm": 0.17623338103294373, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0038, + "step": 20670 + }, + { + "epoch": 1.2656833343533875, + "grad_norm": 0.26931124925613403, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0042, + "step": 20680 + }, + { + "epoch": 1.2662953669135197, + "grad_norm": 0.17984439432621002, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0036, + "step": 20690 + }, + { + "epoch": 1.266907399473652, + "grad_norm": 0.19648219645023346, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0046, + "step": 20700 + }, + { + "epoch": 1.2675194320337841, + "grad_norm": 0.1464766263961792, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0024, + "step": 20710 + }, + { + "epoch": 1.2681314645939163, + "grad_norm": 0.1271074265241623, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0042, + "step": 20720 + }, + { + "epoch": 1.2687434971540485, + "grad_norm": 0.15960967540740967, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0079, + "step": 20730 + }, + { + "epoch": 1.2693555297141808, + "grad_norm": 0.13636153936386108, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0046, + "step": 20740 + }, + { + "epoch": 1.269967562274313, + "grad_norm": 0.19099050760269165, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0046, + "step": 20750 + }, + { + "epoch": 1.2705795948344452, + "grad_norm": 0.28632739186286926, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0036, + "step": 20760 + }, + { + "epoch": 1.2711916273945774, + "grad_norm": 0.2565019726753235, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0055, + "step": 20770 + }, + { + "epoch": 1.2718036599547096, + "grad_norm": 0.24443399906158447, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2724156925148418, + "grad_norm": 0.1396762877702713, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0029, + "step": 20790 + }, + { + "epoch": 1.273027725074974, + "grad_norm": 0.3028377890586853, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0061, + "step": 20800 + }, + { + "epoch": 1.2736397576351062, + "grad_norm": 0.18195804953575134, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0034, + "step": 20810 + }, + { + "epoch": 1.2742517901952384, + "grad_norm": 0.16194652020931244, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0054, + "step": 20820 + }, + { + "epoch": 1.2748638227553706, + "grad_norm": 0.13011956214904785, + "learning_rate": 6.08816828695283e-06, + "loss": 0.003, + "step": 20830 + }, + { + "epoch": 1.2754758553155028, + "grad_norm": 0.23294220864772797, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0041, + "step": 20840 + }, + { + "epoch": 1.276087887875635, + "grad_norm": 0.1892961710691452, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0031, + "step": 20850 + }, + { + "epoch": 1.2766999204357672, + "grad_norm": 0.1984476000070572, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0046, + "step": 20860 + }, + { + "epoch": 1.2773119529958994, + "grad_norm": 0.158709317445755, + "learning_rate": 6.055535530104466e-06, + "loss": 0.003, + "step": 20870 + }, + { + "epoch": 1.2779239855560316, + "grad_norm": 0.16505110263824463, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0039, + "step": 20880 + }, + { + "epoch": 1.2785360181161638, + "grad_norm": 0.18332232534885406, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0036, + "step": 20890 + }, + { + "epoch": 1.279148050676296, + "grad_norm": 0.1797804981470108, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0049, + "step": 20900 + }, + { + "epoch": 1.2797600832364282, + "grad_norm": 0.19247964024543762, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0053, + "step": 20910 + }, + { + "epoch": 1.2803721157965604, + "grad_norm": 0.17845408618450165, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0045, + "step": 20920 + }, + { + "epoch": 1.2809841483566926, + "grad_norm": 0.09454555809497833, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0027, + "step": 20930 + }, + { + "epoch": 1.2815961809168248, + "grad_norm": 0.12647129595279694, + "learning_rate": 5.998651973182953e-06, + "loss": 0.004, + "step": 20940 + }, + { + "epoch": 1.282208213476957, + "grad_norm": 0.39115941524505615, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0051, + "step": 20950 + }, + { + "epoch": 1.2828202460370892, + "grad_norm": 0.29081296920776367, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0044, + "step": 20960 + }, + { + "epoch": 1.2834322785972214, + "grad_norm": 0.1849275827407837, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0042, + "step": 20970 + }, + { + "epoch": 1.2840443111573536, + "grad_norm": 0.24075689911842346, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0031, + "step": 20980 + }, + { + "epoch": 1.2846563437174858, + "grad_norm": 0.12463482469320297, + "learning_rate": 5.958196751005967e-06, + "loss": 0.003, + "step": 20990 + }, + { + "epoch": 1.285268376277618, + "grad_norm": 0.16987742483615875, + "learning_rate": 5.950123419134817e-06, + "loss": 0.004, + "step": 21000 + }, + { + "epoch": 1.2858804088377502, + "grad_norm": 0.20316782593727112, + "learning_rate": 5.942056013575106e-06, + "loss": 0.004, + "step": 21010 + }, + { + "epoch": 1.2864924413978824, + "grad_norm": 0.20989514887332916, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0053, + "step": 21020 + }, + { + "epoch": 1.2871044739580146, + "grad_norm": 0.33795273303985596, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0048, + "step": 21030 + }, + { + "epoch": 1.2877165065181468, + "grad_norm": 0.13918501138687134, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.004, + "step": 21040 + }, + { + "epoch": 1.288328539078279, + "grad_norm": 0.2992899715900421, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0038, + "step": 21050 + }, + { + "epoch": 1.288940571638411, + "grad_norm": 0.2540164589881897, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0037, + "step": 21060 + }, + { + "epoch": 1.2895526041985432, + "grad_norm": 0.161032035946846, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0047, + "step": 21070 + }, + { + "epoch": 1.2901646367586754, + "grad_norm": 0.1743200421333313, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0037, + "step": 21080 + }, + { + "epoch": 1.2907766693188076, + "grad_norm": 0.26604363322257996, + "learning_rate": 5.877731250949785e-06, + "loss": 0.004, + "step": 21090 + }, + { + "epoch": 1.2913887018789398, + "grad_norm": 0.275696724653244, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0044, + "step": 21100 + }, + { + "epoch": 1.292000734439072, + "grad_norm": 0.16888457536697388, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0042, + "step": 21110 + }, + { + "epoch": 1.2926127669992042, + "grad_norm": 0.12902231514453888, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0048, + "step": 21120 + }, + { + "epoch": 1.2932247995593364, + "grad_norm": 0.14577728509902954, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0046, + "step": 21130 + }, + { + "epoch": 1.2938368321194686, + "grad_norm": 0.1544434279203415, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0031, + "step": 21140 + }, + { + "epoch": 1.2944488646796009, + "grad_norm": 0.09238115698099136, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0035, + "step": 21150 + }, + { + "epoch": 1.295060897239733, + "grad_norm": 0.1770051270723343, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0033, + "step": 21160 + }, + { + "epoch": 1.2956729297998653, + "grad_norm": 0.20360831916332245, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0037, + "step": 21170 + }, + { + "epoch": 1.2962849623599975, + "grad_norm": 0.18503794074058533, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0045, + "step": 21180 + }, + { + "epoch": 1.2968969949201297, + "grad_norm": 0.12918968498706818, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0048, + "step": 21190 + }, + { + "epoch": 1.2975090274802619, + "grad_norm": 0.14289438724517822, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0041, + "step": 21200 + }, + { + "epoch": 1.298121060040394, + "grad_norm": 0.17546117305755615, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0042, + "step": 21210 + }, + { + "epoch": 1.2987330926005263, + "grad_norm": 0.2919277846813202, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0051, + "step": 21220 + }, + { + "epoch": 1.2993451251606585, + "grad_norm": 0.0988069474697113, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0044, + "step": 21230 + }, + { + "epoch": 1.2999571577207907, + "grad_norm": 0.19284513592720032, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0037, + "step": 21240 + }, + { + "epoch": 1.3005691902809229, + "grad_norm": 0.12894058227539062, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0031, + "step": 21250 + }, + { + "epoch": 1.301181222841055, + "grad_norm": 0.14740346372127533, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.3017932554011873, + "grad_norm": 0.16817794740200043, + "learning_rate": 5.734414476316747e-06, + "loss": 0.005, + "step": 21270 + }, + { + "epoch": 1.3024052879613195, + "grad_norm": 0.29237234592437744, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0039, + "step": 21280 + }, + { + "epoch": 1.3030173205214517, + "grad_norm": 0.12649856507778168, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.303629353081584, + "grad_norm": 0.11057443916797638, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0039, + "step": 21300 + }, + { + "epoch": 1.304241385641716, + "grad_norm": 0.13494674861431122, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.3048534182018483, + "grad_norm": 0.3079472482204437, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0042, + "step": 21320 + }, + { + "epoch": 1.3054654507619805, + "grad_norm": 0.13513535261154175, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.3060774833221127, + "grad_norm": 0.39266663789749146, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0046, + "step": 21340 + }, + { + "epoch": 1.306689515882245, + "grad_norm": 0.15097978711128235, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.3073015484423771, + "grad_norm": 0.25206202268600464, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0049, + "step": 21360 + }, + { + "epoch": 1.3079135810025093, + "grad_norm": 0.16765817999839783, + "learning_rate": 5.655655685355026e-06, + "loss": 0.005, + "step": 21370 + }, + { + "epoch": 1.3085256135626415, + "grad_norm": 0.2137158215045929, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0048, + "step": 21380 + }, + { + "epoch": 1.3091376461227737, + "grad_norm": 0.19711454212665558, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0043, + "step": 21390 + }, + { + "epoch": 1.309749678682906, + "grad_norm": 0.1722051054239273, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0044, + "step": 21400 + }, + { + "epoch": 1.3103617112430381, + "grad_norm": 0.1807536482810974, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0045, + "step": 21410 + }, + { + "epoch": 1.3109737438031703, + "grad_norm": 0.15052185952663422, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.004, + "step": 21420 + }, + { + "epoch": 1.3115857763633025, + "grad_norm": 0.1485220491886139, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.3121978089234347, + "grad_norm": 0.15065325796604156, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0037, + "step": 21440 + }, + { + "epoch": 1.312809841483567, + "grad_norm": 0.17903591692447662, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0047, + "step": 21450 + }, + { + "epoch": 1.3134218740436991, + "grad_norm": 0.14310622215270996, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0043, + "step": 21460 + }, + { + "epoch": 1.3140339066038313, + "grad_norm": 0.12117830663919449, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0053, + "step": 21470 + }, + { + "epoch": 1.3146459391639636, + "grad_norm": 0.1484573632478714, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0036, + "step": 21480 + }, + { + "epoch": 1.3152579717240958, + "grad_norm": 0.16559219360351562, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0037, + "step": 21490 + }, + { + "epoch": 1.315870004284228, + "grad_norm": 0.21626432240009308, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0031, + "step": 21500 + }, + { + "epoch": 1.3164820368443602, + "grad_norm": 0.08177383989095688, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0052, + "step": 21510 + }, + { + "epoch": 1.3170940694044924, + "grad_norm": 0.18640732765197754, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0062, + "step": 21520 + }, + { + "epoch": 1.3177061019646246, + "grad_norm": 0.2599853277206421, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0039, + "step": 21530 + }, + { + "epoch": 1.3183181345247568, + "grad_norm": 0.1591203212738037, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0034, + "step": 21540 + }, + { + "epoch": 1.318930167084889, + "grad_norm": 0.2834412455558777, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0037, + "step": 21550 + }, + { + "epoch": 1.3195421996450212, + "grad_norm": 0.13853803277015686, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0038, + "step": 21560 + }, + { + "epoch": 1.3201542322051534, + "grad_norm": 0.14707128703594208, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0042, + "step": 21570 + }, + { + "epoch": 1.3207662647652856, + "grad_norm": 0.12561920285224915, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0038, + "step": 21580 + }, + { + "epoch": 1.3213782973254178, + "grad_norm": 0.4156799018383026, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0051, + "step": 21590 + }, + { + "epoch": 1.32199032988555, + "grad_norm": 0.11400662362575531, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0031, + "step": 21600 + }, + { + "epoch": 1.3226023624456822, + "grad_norm": 0.15658807754516602, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0052, + "step": 21610 + }, + { + "epoch": 1.3232143950058144, + "grad_norm": 0.1212862953543663, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0034, + "step": 21620 + }, + { + "epoch": 1.3238264275659466, + "grad_norm": 0.2201654314994812, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0036, + "step": 21630 + }, + { + "epoch": 1.3244384601260788, + "grad_norm": 0.11623375117778778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0032, + "step": 21640 + }, + { + "epoch": 1.325050492686211, + "grad_norm": 0.13092897832393646, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0035, + "step": 21650 + }, + { + "epoch": 1.3256625252463432, + "grad_norm": 0.15409153699874878, + "learning_rate": 5.430834687545416e-06, + "loss": 0.004, + "step": 21660 + }, + { + "epoch": 1.3262745578064754, + "grad_norm": 0.3148297369480133, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0031, + "step": 21670 + }, + { + "epoch": 1.3268865903666076, + "grad_norm": 0.13435055315494537, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0033, + "step": 21680 + }, + { + "epoch": 1.3274986229267398, + "grad_norm": 0.17878089845180511, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0029, + "step": 21690 + }, + { + "epoch": 1.328110655486872, + "grad_norm": 0.1823783665895462, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0039, + "step": 21700 + }, + { + "epoch": 1.3287226880470042, + "grad_norm": 0.14492660760879517, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0033, + "step": 21710 + }, + { + "epoch": 1.3293347206071364, + "grad_norm": 0.1730341762304306, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0041, + "step": 21720 + }, + { + "epoch": 1.3299467531672686, + "grad_norm": 0.07961586117744446, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3305587857274008, + "grad_norm": 0.14440582692623138, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0038, + "step": 21740 + }, + { + "epoch": 1.331170818287533, + "grad_norm": 0.22034496068954468, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0023, + "step": 21750 + }, + { + "epoch": 1.3317828508476652, + "grad_norm": 0.1861305832862854, + "learning_rate": 5.354573491223212e-06, + "loss": 0.005, + "step": 21760 + }, + { + "epoch": 1.3323948834077972, + "grad_norm": 0.15587164461612701, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0044, + "step": 21770 + }, + { + "epoch": 1.3330069159679294, + "grad_norm": 0.6852900981903076, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0075, + "step": 21780 + }, + { + "epoch": 1.3336189485280616, + "grad_norm": 0.14315280318260193, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0034, + "step": 21790 + }, + { + "epoch": 1.3342309810881938, + "grad_norm": 0.350981205701828, + "learning_rate": 5.324254018551227e-06, + "loss": 0.004, + "step": 21800 + }, + { + "epoch": 1.334843013648326, + "grad_norm": 0.12344911694526672, + "learning_rate": 5.316690780174352e-06, + "loss": 0.004, + "step": 21810 + }, + { + "epoch": 1.3354550462084582, + "grad_norm": 0.18744061887264252, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3360670787685904, + "grad_norm": 0.22747837007045746, + "learning_rate": 5.301584321328435e-06, + "loss": 0.004, + "step": 21830 + }, + { + "epoch": 1.3366791113287226, + "grad_norm": 0.22695699334144592, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0042, + "step": 21840 + }, + { + "epoch": 1.3372911438888548, + "grad_norm": 0.17258964478969574, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0044, + "step": 21850 + }, + { + "epoch": 1.337903176448987, + "grad_norm": 0.1523793637752533, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0047, + "step": 21860 + }, + { + "epoch": 1.3385152090091192, + "grad_norm": 0.1983587145805359, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0037, + "step": 21870 + }, + { + "epoch": 1.3391272415692514, + "grad_norm": 0.1263747215270996, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0034, + "step": 21880 + }, + { + "epoch": 1.3397392741293837, + "grad_norm": 0.1550009399652481, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3403513066895159, + "grad_norm": 0.14963915944099426, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.340963339249648, + "grad_norm": 0.17783671617507935, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0036, + "step": 21910 + }, + { + "epoch": 1.3415753718097803, + "grad_norm": 0.2715896964073181, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3421874043699125, + "grad_norm": 0.22924886643886566, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0037, + "step": 21930 + }, + { + "epoch": 1.3427994369300447, + "grad_norm": 0.13689789175987244, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0033, + "step": 21940 + }, + { + "epoch": 1.3434114694901769, + "grad_norm": 0.09137748926877975, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0031, + "step": 21950 + }, + { + "epoch": 1.344023502050309, + "grad_norm": 0.17097881436347961, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0031, + "step": 21960 + }, + { + "epoch": 1.3446355346104413, + "grad_norm": 0.23919200897216797, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0046, + "step": 21970 + }, + { + "epoch": 1.3452475671705735, + "grad_norm": 0.14261527359485626, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0037, + "step": 21980 + }, + { + "epoch": 1.3458595997307057, + "grad_norm": 0.156734898686409, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.3464716322908379, + "grad_norm": 0.21755588054656982, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0032, + "step": 22000 + }, + { + "epoch": 1.34708366485097, + "grad_norm": 0.1373317390680313, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0033, + "step": 22010 + }, + { + "epoch": 1.3476956974111023, + "grad_norm": 0.1646856814622879, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0047, + "step": 22020 + }, + { + "epoch": 1.3483077299712345, + "grad_norm": 0.1908850073814392, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0044, + "step": 22030 + }, + { + "epoch": 1.3489197625313667, + "grad_norm": 0.24862833321094513, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0041, + "step": 22040 + }, + { + "epoch": 1.349531795091499, + "grad_norm": 0.15980397164821625, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0033, + "step": 22050 + }, + { + "epoch": 1.350143827651631, + "grad_norm": 0.1157977357506752, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0036, + "step": 22060 + }, + { + "epoch": 1.3507558602117633, + "grad_norm": 0.11186888068914413, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0046, + "step": 22070 + }, + { + "epoch": 1.3513678927718955, + "grad_norm": 0.17715996503829956, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0035, + "step": 22080 + }, + { + "epoch": 1.3519799253320277, + "grad_norm": 0.1265174001455307, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0048, + "step": 22090 + }, + { + "epoch": 1.35259195789216, + "grad_norm": 0.13969522714614868, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0028, + "step": 22100 + }, + { + "epoch": 1.3532039904522921, + "grad_norm": 0.13246525824069977, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0026, + "step": 22110 + }, + { + "epoch": 1.3538160230124243, + "grad_norm": 0.14675064384937286, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0082, + "step": 22120 + }, + { + "epoch": 1.3544280555725565, + "grad_norm": 0.15810683369636536, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0031, + "step": 22130 + }, + { + "epoch": 1.3550400881326887, + "grad_norm": 0.20675864815711975, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0035, + "step": 22140 + }, + { + "epoch": 1.355652120692821, + "grad_norm": 0.1921442300081253, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0038, + "step": 22150 + }, + { + "epoch": 1.3562641532529531, + "grad_norm": 0.14300711452960968, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0035, + "step": 22160 + }, + { + "epoch": 1.3568761858130853, + "grad_norm": 0.0656728520989418, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0047, + "step": 22170 + }, + { + "epoch": 1.3574882183732175, + "grad_norm": 0.148203507065773, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0041, + "step": 22180 + }, + { + "epoch": 1.3581002509333495, + "grad_norm": 0.15472126007080078, + "learning_rate": 5.034310349217475e-06, + "loss": 0.004, + "step": 22190 + }, + { + "epoch": 1.3587122834934817, + "grad_norm": 0.12006669491529465, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0034, + "step": 22200 + }, + { + "epoch": 1.359324316053614, + "grad_norm": 0.15345145761966705, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0032, + "step": 22210 + }, + { + "epoch": 1.3599363486137461, + "grad_norm": 0.17429186403751373, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0039, + "step": 22220 + }, + { + "epoch": 1.3605483811738783, + "grad_norm": 0.20691345632076263, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0029, + "step": 22230 + }, + { + "epoch": 1.3611604137340105, + "grad_norm": 0.1874946504831314, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0042, + "step": 22240 + }, + { + "epoch": 1.3617724462941427, + "grad_norm": 0.12159912288188934, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0033, + "step": 22250 + }, + { + "epoch": 1.362384478854275, + "grad_norm": 0.29434919357299805, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0044, + "step": 22260 + }, + { + "epoch": 1.3629965114144071, + "grad_norm": 0.06661798804998398, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0031, + "step": 22270 + }, + { + "epoch": 1.3636085439745393, + "grad_norm": 0.14819994568824768, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0039, + "step": 22280 + }, + { + "epoch": 1.3642205765346715, + "grad_norm": 0.17289887368679047, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0035, + "step": 22290 + }, + { + "epoch": 1.3648326090948038, + "grad_norm": 0.18789313733577728, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0036, + "step": 22300 + }, + { + "epoch": 1.365444641654936, + "grad_norm": 0.1877586394548416, + "learning_rate": 4.947215397583639e-06, + "loss": 0.004, + "step": 22310 + }, + { + "epoch": 1.3660566742150682, + "grad_norm": 0.11696574836969376, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0036, + "step": 22320 + }, + { + "epoch": 1.3666687067752004, + "grad_norm": 0.2511763274669647, + "learning_rate": 4.932798621873274e-06, + "loss": 0.004, + "step": 22330 + }, + { + "epoch": 1.3672807393353326, + "grad_norm": 0.15005314350128174, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0046, + "step": 22340 + }, + { + "epoch": 1.3678927718954648, + "grad_norm": 0.16856855154037476, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0056, + "step": 22350 + }, + { + "epoch": 1.368504804455597, + "grad_norm": 0.24532385170459747, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0035, + "step": 22360 + }, + { + "epoch": 1.3691168370157292, + "grad_norm": 0.29320162534713745, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0047, + "step": 22370 + }, + { + "epoch": 1.3697288695758614, + "grad_norm": 0.1518300473690033, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0041, + "step": 22380 + }, + { + "epoch": 1.3703409021359936, + "grad_norm": 0.13431201875209808, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0033, + "step": 22390 + }, + { + "epoch": 1.3709529346961258, + "grad_norm": 0.17390409111976624, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0039, + "step": 22400 + }, + { + "epoch": 1.371564967256258, + "grad_norm": 0.16482478380203247, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.007, + "step": 22410 + }, + { + "epoch": 1.3721769998163902, + "grad_norm": 0.11469490826129913, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0041, + "step": 22420 + }, + { + "epoch": 1.3727890323765224, + "grad_norm": 0.2327135056257248, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0043, + "step": 22430 + }, + { + "epoch": 1.3734010649366546, + "grad_norm": 0.1373092532157898, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0036, + "step": 22440 + }, + { + "epoch": 1.3740130974967868, + "grad_norm": 0.1534084528684616, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0028, + "step": 22450 + }, + { + "epoch": 1.374625130056919, + "grad_norm": 0.3217960596084595, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0044, + "step": 22460 + }, + { + "epoch": 1.3752371626170512, + "grad_norm": 0.14245563745498657, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0039, + "step": 22470 + }, + { + "epoch": 1.3758491951771834, + "grad_norm": 0.17652876675128937, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0031, + "step": 22480 + }, + { + "epoch": 1.3764612277373156, + "grad_norm": 0.1996244192123413, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0034, + "step": 22490 + }, + { + "epoch": 1.3770732602974478, + "grad_norm": 0.1658472716808319, + "learning_rate": 4.81141273556404e-06, + "loss": 0.003, + "step": 22500 + }, + { + "epoch": 1.37768529285758, + "grad_norm": 0.16233472526073456, + "learning_rate": 4.804337352679613e-06, + "loss": 0.004, + "step": 22510 + }, + { + "epoch": 1.3782973254177122, + "grad_norm": 0.13045033812522888, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0049, + "step": 22520 + }, + { + "epoch": 1.3789093579778444, + "grad_norm": 0.1195274218916893, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0042, + "step": 22530 + }, + { + "epoch": 1.3795213905379766, + "grad_norm": 0.14395804703235626, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0036, + "step": 22540 + }, + { + "epoch": 1.3801334230981088, + "grad_norm": 0.24495497345924377, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0047, + "step": 22550 + }, + { + "epoch": 1.380745455658241, + "grad_norm": 0.14288006722927094, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0044, + "step": 22560 + }, + { + "epoch": 1.3813574882183732, + "grad_norm": 0.16967979073524475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0051, + "step": 22570 + }, + { + "epoch": 1.3819695207785054, + "grad_norm": 0.2023036777973175, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0032, + "step": 22580 + }, + { + "epoch": 1.3825815533386376, + "grad_norm": 0.1191902756690979, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0026, + "step": 22590 + }, + { + "epoch": 1.3831935858987698, + "grad_norm": 0.16922403872013092, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0037, + "step": 22600 + }, + { + "epoch": 1.383805618458902, + "grad_norm": 0.12394976615905762, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0038, + "step": 22610 + }, + { + "epoch": 1.3844176510190342, + "grad_norm": 0.23889753222465515, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0041, + "step": 22620 + }, + { + "epoch": 1.3850296835791664, + "grad_norm": 0.31215062737464905, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0036, + "step": 22630 + }, + { + "epoch": 1.3856417161392987, + "grad_norm": 0.1519152820110321, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0042, + "step": 22640 + }, + { + "epoch": 1.3862537486994309, + "grad_norm": 0.3375433683395386, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0039, + "step": 22650 + }, + { + "epoch": 1.386865781259563, + "grad_norm": 0.21715323626995087, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0027, + "step": 22660 + }, + { + "epoch": 1.3874778138196953, + "grad_norm": 0.2066027969121933, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0033, + "step": 22670 + }, + { + "epoch": 1.3880898463798275, + "grad_norm": 0.11542408168315887, + "learning_rate": 4.6851750421442e-06, + "loss": 0.004, + "step": 22680 + }, + { + "epoch": 1.3887018789399597, + "grad_norm": 0.1183561235666275, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0037, + "step": 22690 + }, + { + "epoch": 1.3893139115000919, + "grad_norm": 0.24478662014007568, + "learning_rate": 4.67129597392514e-06, + "loss": 0.004, + "step": 22700 + }, + { + "epoch": 1.389925944060224, + "grad_norm": 0.28880801796913147, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0039, + "step": 22710 + }, + { + "epoch": 1.3905379766203563, + "grad_norm": 0.14014701545238495, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0034, + "step": 22720 + }, + { + "epoch": 1.3911500091804885, + "grad_norm": 0.1549793928861618, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0033, + "step": 22730 + }, + { + "epoch": 1.3917620417406207, + "grad_norm": 0.1423012614250183, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.3923740743007529, + "grad_norm": 0.291273832321167, + "learning_rate": 4.636728419531758e-06, + "loss": 0.004, + "step": 22750 + }, + { + "epoch": 1.392986106860885, + "grad_norm": 0.38278621435165405, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0045, + "step": 22760 + }, + { + "epoch": 1.3935981394210173, + "grad_norm": 0.20528365671634674, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0042, + "step": 22770 + }, + { + "epoch": 1.3942101719811495, + "grad_norm": 0.11913729459047318, + "learning_rate": 4.616077433849538e-06, + "loss": 0.003, + "step": 22780 + }, + { + "epoch": 1.3948222045412817, + "grad_norm": 0.21683627367019653, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0027, + "step": 22790 + }, + { + "epoch": 1.395434237101414, + "grad_norm": 0.12143554538488388, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0031, + "step": 22800 + }, + { + "epoch": 1.396046269661546, + "grad_norm": 0.14171159267425537, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0039, + "step": 22810 + }, + { + "epoch": 1.3966583022216783, + "grad_norm": 0.19254790246486664, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0043, + "step": 22820 + }, + { + "epoch": 1.3972703347818105, + "grad_norm": 0.12295825034379959, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0045, + "step": 22830 + }, + { + "epoch": 1.3978823673419427, + "grad_norm": 0.1274985820055008, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0037, + "step": 22840 + }, + { + "epoch": 1.398494399902075, + "grad_norm": 0.2940427362918854, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0059, + "step": 22850 + }, + { + "epoch": 1.3991064324622071, + "grad_norm": 0.15357589721679688, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0032, + "step": 22860 + }, + { + "epoch": 1.3997184650223393, + "grad_norm": 0.12781603634357452, + "learning_rate": 4.554529907376127e-06, + "loss": 0.003, + "step": 22870 + }, + { + "epoch": 1.4003304975824715, + "grad_norm": 0.34976109862327576, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0047, + "step": 22880 + }, + { + "epoch": 1.4009425301426035, + "grad_norm": 0.1797824203968048, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0034, + "step": 22890 + }, + { + "epoch": 1.4015545627027357, + "grad_norm": 0.13750647008419037, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0046, + "step": 22900 + }, + { + "epoch": 1.402166595262868, + "grad_norm": 0.22893266379833221, + "learning_rate": 4.527371771040039e-06, + "loss": 0.005, + "step": 22910 + }, + { + "epoch": 1.4027786278230001, + "grad_norm": 0.1595923751592636, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0045, + "step": 22920 + }, + { + "epoch": 1.4033906603831323, + "grad_norm": 0.11474192142486572, + "learning_rate": 4.513838246961138e-06, + "loss": 0.003, + "step": 22930 + }, + { + "epoch": 1.4040026929432645, + "grad_norm": 0.12208060175180435, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.4046147255033967, + "grad_norm": 0.2919016480445862, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0036, + "step": 22950 + }, + { + "epoch": 1.405226758063529, + "grad_norm": 0.19161155819892883, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0049, + "step": 22960 + }, + { + "epoch": 1.4058387906236611, + "grad_norm": 0.1454700380563736, + "learning_rate": 4.486862604628113e-06, + "loss": 0.004, + "step": 22970 + }, + { + "epoch": 1.4064508231837933, + "grad_norm": 0.227305606007576, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0043, + "step": 22980 + }, + { + "epoch": 1.4070628557439255, + "grad_norm": 0.09430288523435593, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0051, + "step": 22990 + }, + { + "epoch": 1.4076748883040577, + "grad_norm": 0.09664178639650345, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.40828692086419, + "grad_norm": 0.21268269419670105, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0031, + "step": 23010 + }, + { + "epoch": 1.4088989534243221, + "grad_norm": 0.09796992689371109, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0041, + "step": 23020 + }, + { + "epoch": 1.4095109859844543, + "grad_norm": 0.18376071751117706, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0039, + "step": 23030 + }, + { + "epoch": 1.4101230185445865, + "grad_norm": 0.10276145488023758, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0035, + "step": 23040 + }, + { + "epoch": 1.4107350511047188, + "grad_norm": 0.16089564561843872, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0051, + "step": 23050 + }, + { + "epoch": 1.411347083664851, + "grad_norm": 0.1825491487979889, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0036, + "step": 23060 + }, + { + "epoch": 1.4119591162249832, + "grad_norm": 0.24405492842197418, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0028, + "step": 23070 + }, + { + "epoch": 1.4125711487851154, + "grad_norm": 0.14085668325424194, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0039, + "step": 23080 + }, + { + "epoch": 1.4131831813452476, + "grad_norm": 0.11708472669124603, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0035, + "step": 23090 + }, + { + "epoch": 1.4137952139053798, + "grad_norm": 0.12108796834945679, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0036, + "step": 23100 + }, + { + "epoch": 1.414407246465512, + "grad_norm": 0.14601854979991913, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0036, + "step": 23110 + }, + { + "epoch": 1.4150192790256442, + "grad_norm": 0.10614772886037827, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0031, + "step": 23120 + }, + { + "epoch": 1.4156313115857764, + "grad_norm": 0.09014416486024857, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0027, + "step": 23130 + }, + { + "epoch": 1.4162433441459086, + "grad_norm": 0.15246634185314178, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0031, + "step": 23140 + }, + { + "epoch": 1.4168553767060408, + "grad_norm": 0.20104879140853882, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0075, + "step": 23150 + }, + { + "epoch": 1.417467409266173, + "grad_norm": 0.1359969973564148, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0036, + "step": 23160 + }, + { + "epoch": 1.4180794418263052, + "grad_norm": 0.19849587976932526, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0054, + "step": 23170 + }, + { + "epoch": 1.4186914743864374, + "grad_norm": 0.12617377936840057, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0024, + "step": 23180 + }, + { + "epoch": 1.4193035069465696, + "grad_norm": 0.15024134516716003, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0034, + "step": 23190 + }, + { + "epoch": 1.4199155395067018, + "grad_norm": 0.2345605194568634, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0036, + "step": 23200 + }, + { + "epoch": 1.420527572066834, + "grad_norm": 0.13125917315483093, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0035, + "step": 23210 + }, + { + "epoch": 1.4211396046269662, + "grad_norm": 0.20977836847305298, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0038, + "step": 23220 + }, + { + "epoch": 1.4217516371870984, + "grad_norm": 0.3925677537918091, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0043, + "step": 23230 + }, + { + "epoch": 1.4223636697472306, + "grad_norm": 0.17691555619239807, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0048, + "step": 23240 + }, + { + "epoch": 1.4229757023073628, + "grad_norm": 0.18366187810897827, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0033, + "step": 23250 + }, + { + "epoch": 1.423587734867495, + "grad_norm": 0.15539205074310303, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0039, + "step": 23260 + }, + { + "epoch": 1.4241997674276272, + "grad_norm": 0.15048520267009735, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0032, + "step": 23270 + }, + { + "epoch": 1.4248117999877594, + "grad_norm": 0.2631739675998688, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0061, + "step": 23280 + }, + { + "epoch": 1.4254238325478916, + "grad_norm": 0.18545641005039215, + "learning_rate": 4.275502195405868e-06, + "loss": 0.005, + "step": 23290 + }, + { + "epoch": 1.4260358651080238, + "grad_norm": 0.25486356019973755, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.426647897668156, + "grad_norm": 0.2514204978942871, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.427259930228288, + "grad_norm": 0.12997376918792725, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0034, + "step": 23320 + }, + { + "epoch": 1.4278719627884202, + "grad_norm": 0.26096200942993164, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0047, + "step": 23330 + }, + { + "epoch": 1.4284839953485524, + "grad_norm": 0.2292930781841278, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0038, + "step": 23340 + }, + { + "epoch": 1.4290960279086846, + "grad_norm": 0.20056717097759247, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0037, + "step": 23350 + }, + { + "epoch": 1.4297080604688168, + "grad_norm": 0.1608581393957138, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0032, + "step": 23360 + }, + { + "epoch": 1.430320093028949, + "grad_norm": 0.235102578997612, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0034, + "step": 23370 + }, + { + "epoch": 1.4309321255890812, + "grad_norm": 0.11869259178638458, + "learning_rate": 4.217502203129258e-06, + "loss": 0.005, + "step": 23380 + }, + { + "epoch": 1.4315441581492134, + "grad_norm": 0.167036771774292, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0045, + "step": 23390 + }, + { + "epoch": 1.4321561907093456, + "grad_norm": 0.13766071200370789, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.4327682232694778, + "grad_norm": 0.15444986522197723, + "learning_rate": 4.198311874248223e-06, + "loss": 0.004, + "step": 23410 + }, + { + "epoch": 1.43338025582961, + "grad_norm": 0.11997724324464798, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0034, + "step": 23420 + }, + { + "epoch": 1.4339922883897422, + "grad_norm": 0.1533307433128357, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0038, + "step": 23430 + }, + { + "epoch": 1.4346043209498744, + "grad_norm": 0.10954161733388901, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0045, + "step": 23440 + }, + { + "epoch": 1.4352163535100066, + "grad_norm": 0.16601058840751648, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4358283860701389, + "grad_norm": 0.1756889373064041, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0035, + "step": 23460 + }, + { + "epoch": 1.436440418630271, + "grad_norm": 0.12633845210075378, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0046, + "step": 23470 + }, + { + "epoch": 1.4370524511904033, + "grad_norm": 0.15678541362285614, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0025, + "step": 23480 + }, + { + "epoch": 1.4376644837505355, + "grad_norm": 0.13923659920692444, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0052, + "step": 23490 + }, + { + "epoch": 1.4382765163106677, + "grad_norm": 0.28792211413383484, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4388885488707999, + "grad_norm": 0.16125047206878662, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0037, + "step": 23510 + }, + { + "epoch": 1.439500581430932, + "grad_norm": 0.2653597593307495, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0067, + "step": 23520 + }, + { + "epoch": 1.4401126139910643, + "grad_norm": 0.2692917585372925, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0031, + "step": 23530 + }, + { + "epoch": 1.4407246465511965, + "grad_norm": 0.2234862893819809, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0044, + "step": 23540 + }, + { + "epoch": 1.4413366791113287, + "grad_norm": 0.17526887357234955, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0025, + "step": 23550 + }, + { + "epoch": 1.4419487116714609, + "grad_norm": 0.10404029488563538, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0041, + "step": 23560 + }, + { + "epoch": 1.442560744231593, + "grad_norm": 0.1385052353143692, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0027, + "step": 23570 + }, + { + "epoch": 1.4431727767917253, + "grad_norm": 0.30865412950515747, + "learning_rate": 4.090929556079854e-06, + "loss": 0.004, + "step": 23580 + }, + { + "epoch": 1.4437848093518575, + "grad_norm": 0.10908320546150208, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0041, + "step": 23590 + }, + { + "epoch": 1.4443968419119897, + "grad_norm": 0.09885916113853455, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0045, + "step": 23600 + }, + { + "epoch": 1.445008874472122, + "grad_norm": 0.1685211956501007, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0031, + "step": 23610 + }, + { + "epoch": 1.445620907032254, + "grad_norm": 0.0967954769730568, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0031, + "step": 23620 + }, + { + "epoch": 1.4462329395923863, + "grad_norm": 0.07489120960235596, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0038, + "step": 23630 + }, + { + "epoch": 1.4468449721525185, + "grad_norm": 0.20616063475608826, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0043, + "step": 23640 + }, + { + "epoch": 1.4474570047126507, + "grad_norm": 0.15788249671459198, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0031, + "step": 23650 + }, + { + "epoch": 1.448069037272783, + "grad_norm": 0.10360633581876755, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0027, + "step": 23660 + }, + { + "epoch": 1.4486810698329151, + "grad_norm": 0.2871163785457611, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0028, + "step": 23670 + }, + { + "epoch": 1.4492931023930473, + "grad_norm": 0.15280364453792572, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0042, + "step": 23680 + }, + { + "epoch": 1.4499051349531795, + "grad_norm": 0.17502477765083313, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0031, + "step": 23690 + }, + { + "epoch": 1.4505171675133117, + "grad_norm": 0.2154005616903305, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0036, + "step": 23700 + }, + { + "epoch": 1.451129200073444, + "grad_norm": 0.15002919733524323, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0033, + "step": 23710 + }, + { + "epoch": 1.4517412326335761, + "grad_norm": 0.10422170162200928, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0032, + "step": 23720 + }, + { + "epoch": 1.4523532651937083, + "grad_norm": 0.15197636187076569, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0043, + "step": 23730 + }, + { + "epoch": 1.4529652977538405, + "grad_norm": 0.2571481466293335, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.4535773303139727, + "grad_norm": 0.12697578966617584, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0034, + "step": 23750 + }, + { + "epoch": 1.454189362874105, + "grad_norm": 0.14347535371780396, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0051, + "step": 23760 + }, + { + "epoch": 1.4548013954342371, + "grad_norm": 0.1494351178407669, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0037, + "step": 23770 + }, + { + "epoch": 1.4554134279943693, + "grad_norm": 0.23901797831058502, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0032, + "step": 23780 + }, + { + "epoch": 1.4560254605545015, + "grad_norm": 0.1434790939092636, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0036, + "step": 23790 + }, + { + "epoch": 1.4566374931146338, + "grad_norm": 0.1456829458475113, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0034, + "step": 23800 + }, + { + "epoch": 1.457249525674766, + "grad_norm": 0.33969590067863464, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0038, + "step": 23810 + }, + { + "epoch": 1.4578615582348982, + "grad_norm": 0.1768753081560135, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0027, + "step": 23820 + }, + { + "epoch": 1.4584735907950304, + "grad_norm": 0.15212708711624146, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0032, + "step": 23830 + }, + { + "epoch": 1.4590856233551626, + "grad_norm": 0.10870973765850067, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0033, + "step": 23840 + }, + { + "epoch": 1.4596976559152948, + "grad_norm": 0.17898528277873993, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0022, + "step": 23850 + }, + { + "epoch": 1.460309688475427, + "grad_norm": 0.15515227615833282, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0034, + "step": 23860 + }, + { + "epoch": 1.4609217210355592, + "grad_norm": 0.11047070473432541, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0032, + "step": 23870 + }, + { + "epoch": 1.4615337535956914, + "grad_norm": 0.08628113567829132, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0032, + "step": 23880 + }, + { + "epoch": 1.4621457861558236, + "grad_norm": 0.358903706073761, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0043, + "step": 23890 + }, + { + "epoch": 1.4627578187159558, + "grad_norm": 0.13986052572727203, + "learning_rate": 3.895183209452123e-06, + "loss": 0.003, + "step": 23900 + }, + { + "epoch": 1.463369851276088, + "grad_norm": 0.09236793220043182, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0029, + "step": 23910 + }, + { + "epoch": 1.4639818838362202, + "grad_norm": 0.14616963267326355, + "learning_rate": 3.883230136754435e-06, + "loss": 0.005, + "step": 23920 + }, + { + "epoch": 1.4645939163963524, + "grad_norm": 0.0754290223121643, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0031, + "step": 23930 + }, + { + "epoch": 1.4652059489564846, + "grad_norm": 0.16520163416862488, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0037, + "step": 23940 + }, + { + "epoch": 1.4658179815166168, + "grad_norm": 0.06801608204841614, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0023, + "step": 23950 + }, + { + "epoch": 1.466430014076749, + "grad_norm": 0.3087909519672394, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0042, + "step": 23960 + }, + { + "epoch": 1.4670420466368812, + "grad_norm": 0.23470532894134521, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0046, + "step": 23970 + }, + { + "epoch": 1.4676540791970134, + "grad_norm": 0.10248749703168869, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0025, + "step": 23980 + }, + { + "epoch": 1.4682661117571456, + "grad_norm": 0.12478570640087128, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0035, + "step": 23990 + }, + { + "epoch": 1.4688781443172778, + "grad_norm": 0.16669252514839172, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0034, + "step": 24000 + }, + { + "epoch": 1.46949017687741, + "grad_norm": 0.12477939575910568, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.470102209437542, + "grad_norm": 0.1738445907831192, + "learning_rate": 3.823967005382315e-06, + "loss": 0.003, + "step": 24020 + }, + { + "epoch": 1.4707142419976742, + "grad_norm": 0.11228524148464203, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0029, + "step": 24030 + }, + { + "epoch": 1.4713262745578064, + "grad_norm": 0.28472721576690674, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0035, + "step": 24040 + }, + { + "epoch": 1.4719383071179386, + "grad_norm": 0.18087328970432281, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0037, + "step": 24050 + }, + { + "epoch": 1.4725503396780708, + "grad_norm": 0.39030423760414124, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0043, + "step": 24060 + }, + { + "epoch": 1.473162372238203, + "grad_norm": 0.164345845580101, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0029, + "step": 24070 + }, + { + "epoch": 1.4737744047983352, + "grad_norm": 0.14081600308418274, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0038, + "step": 24080 + }, + { + "epoch": 1.4743864373584674, + "grad_norm": 0.27649205923080444, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0037, + "step": 24090 + }, + { + "epoch": 1.4749984699185996, + "grad_norm": 0.08673480153083801, + "learning_rate": 3.777162510056721e-06, + "loss": 0.004, + "step": 24100 + }, + { + "epoch": 1.4756105024787318, + "grad_norm": 0.11770286411046982, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0033, + "step": 24110 + }, + { + "epoch": 1.476222535038864, + "grad_norm": 0.11967290937900543, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0035, + "step": 24120 + }, + { + "epoch": 1.4768345675989962, + "grad_norm": 0.12635833024978638, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0043, + "step": 24130 + }, + { + "epoch": 1.4774466001591284, + "grad_norm": 0.13505803048610687, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0034, + "step": 24140 + }, + { + "epoch": 1.4780586327192606, + "grad_norm": 0.17781652510166168, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0031, + "step": 24150 + }, + { + "epoch": 1.4786706652793928, + "grad_norm": 0.18974725902080536, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0051, + "step": 24160 + }, + { + "epoch": 1.479282697839525, + "grad_norm": 0.12072815746068954, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0027, + "step": 24170 + }, + { + "epoch": 1.4798947303996572, + "grad_norm": 0.10813914984464645, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0023, + "step": 24180 + }, + { + "epoch": 1.4805067629597894, + "grad_norm": 0.07975378632545471, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0035, + "step": 24190 + }, + { + "epoch": 1.4811187955199216, + "grad_norm": 0.0948014184832573, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0033, + "step": 24200 + }, + { + "epoch": 1.4817308280800539, + "grad_norm": 0.11943913251161575, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0038, + "step": 24210 + }, + { + "epoch": 1.482342860640186, + "grad_norm": 0.34374934434890747, + "learning_rate": 3.707974016467e-06, + "loss": 0.0043, + "step": 24220 + }, + { + "epoch": 1.4829548932003183, + "grad_norm": 0.264528751373291, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0037, + "step": 24230 + }, + { + "epoch": 1.4835669257604505, + "grad_norm": 0.08419078588485718, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0031, + "step": 24240 + }, + { + "epoch": 1.4841789583205827, + "grad_norm": 0.3805602192878723, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0039, + "step": 24250 + }, + { + "epoch": 1.4847909908807149, + "grad_norm": 0.09091196954250336, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0034, + "step": 24260 + }, + { + "epoch": 1.485403023440847, + "grad_norm": 0.1352047175168991, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0033, + "step": 24270 + }, + { + "epoch": 1.4860150560009793, + "grad_norm": 0.14287787675857544, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0033, + "step": 24280 + }, + { + "epoch": 1.4866270885611115, + "grad_norm": 0.15490861237049103, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0037, + "step": 24290 + }, + { + "epoch": 1.4872391211212437, + "grad_norm": 0.08607941120862961, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0042, + "step": 24300 + }, + { + "epoch": 1.4878511536813759, + "grad_norm": 0.2872561514377594, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0039, + "step": 24310 + }, + { + "epoch": 1.488463186241508, + "grad_norm": 0.09383561462163925, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4890752188016403, + "grad_norm": 0.13576671481132507, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0039, + "step": 24330 + }, + { + "epoch": 1.4896872513617725, + "grad_norm": 0.21924526989459991, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0038, + "step": 24340 + }, + { + "epoch": 1.4902992839219047, + "grad_norm": 0.24333837628364563, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.490911316482037, + "grad_norm": 0.08171682059764862, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0034, + "step": 24360 + }, + { + "epoch": 1.491523349042169, + "grad_norm": 0.11815544962882996, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0027, + "step": 24370 + }, + { + "epoch": 1.4921353816023013, + "grad_norm": 0.15248773992061615, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0035, + "step": 24380 + }, + { + "epoch": 1.4927474141624335, + "grad_norm": 0.13664020597934723, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0032, + "step": 24390 + }, + { + "epoch": 1.4933594467225657, + "grad_norm": 0.2877022624015808, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0034, + "step": 24400 + }, + { + "epoch": 1.493971479282698, + "grad_norm": 0.1447642594575882, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0033, + "step": 24410 + }, + { + "epoch": 1.4945835118428301, + "grad_norm": 0.18032193183898926, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0036, + "step": 24420 + }, + { + "epoch": 1.4951955444029623, + "grad_norm": 0.1249038353562355, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0023, + "step": 24430 + }, + { + "epoch": 1.4958075769630943, + "grad_norm": 0.21674089133739471, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0036, + "step": 24440 + }, + { + "epoch": 1.4964196095232265, + "grad_norm": 0.2503979504108429, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0039, + "step": 24450 + }, + { + "epoch": 1.4970316420833587, + "grad_norm": 0.15412171185016632, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0035, + "step": 24460 + }, + { + "epoch": 1.497643674643491, + "grad_norm": 0.17718803882598877, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0055, + "step": 24470 + }, + { + "epoch": 1.498255707203623, + "grad_norm": 0.24290283024311066, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0033, + "step": 24480 + }, + { + "epoch": 1.4988677397637553, + "grad_norm": 0.20131447911262512, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0035, + "step": 24490 + }, + { + "epoch": 1.4994797723238875, + "grad_norm": 0.18041104078292847, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0037, + "step": 24500 + }, + { + "epoch": 1.5000918048840197, + "grad_norm": 0.11311472952365875, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0027, + "step": 24510 + }, + { + "epoch": 1.500703837444152, + "grad_norm": 0.10401099175214767, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0035, + "step": 24520 + }, + { + "epoch": 1.5013158700042841, + "grad_norm": 0.16640698909759521, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0029, + "step": 24530 + }, + { + "epoch": 1.5019279025644163, + "grad_norm": 0.1116192489862442, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0027, + "step": 24540 + }, + { + "epoch": 1.5025399351245485, + "grad_norm": 0.14617346227169037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0037, + "step": 24550 + }, + { + "epoch": 1.5031519676846807, + "grad_norm": 0.10546499490737915, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0025, + "step": 24560 + }, + { + "epoch": 1.503764000244813, + "grad_norm": 0.11696954816579819, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0039, + "step": 24570 + }, + { + "epoch": 1.5043760328049451, + "grad_norm": 0.1503429412841797, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0038, + "step": 24580 + }, + { + "epoch": 1.5049880653650773, + "grad_norm": 0.13094773888587952, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0037, + "step": 24590 + }, + { + "epoch": 1.5056000979252095, + "grad_norm": 0.1519947648048401, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0027, + "step": 24600 + }, + { + "epoch": 1.5062121304853417, + "grad_norm": 0.3586391806602478, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0041, + "step": 24610 + }, + { + "epoch": 1.506824163045474, + "grad_norm": 0.14964115619659424, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0029, + "step": 24620 + }, + { + "epoch": 1.5074361956056062, + "grad_norm": 0.2676304578781128, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0033, + "step": 24630 + }, + { + "epoch": 1.5080482281657384, + "grad_norm": 0.117411769926548, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0041, + "step": 24640 + }, + { + "epoch": 1.5086602607258706, + "grad_norm": 0.11224953830242157, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0035, + "step": 24650 + }, + { + "epoch": 1.5092722932860028, + "grad_norm": 0.14367471635341644, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0033, + "step": 24660 + }, + { + "epoch": 1.509884325846135, + "grad_norm": 0.27663105726242065, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.004, + "step": 24670 + }, + { + "epoch": 1.5104963584062672, + "grad_norm": 0.08599471300840378, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0031, + "step": 24680 + }, + { + "epoch": 1.5111083909663994, + "grad_norm": 0.11320041120052338, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0033, + "step": 24690 + }, + { + "epoch": 1.5117204235265316, + "grad_norm": 0.0896427258849144, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0032, + "step": 24700 + }, + { + "epoch": 1.5123324560866638, + "grad_norm": 0.1055784597992897, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0028, + "step": 24710 + }, + { + "epoch": 1.512944488646796, + "grad_norm": 0.0936208963394165, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0036, + "step": 24720 + }, + { + "epoch": 1.5135565212069282, + "grad_norm": 0.13069137930870056, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0033, + "step": 24730 + }, + { + "epoch": 1.5141685537670604, + "grad_norm": 0.17260710895061493, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0034, + "step": 24740 + }, + { + "epoch": 1.5147805863271926, + "grad_norm": 0.26109611988067627, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0044, + "step": 24750 + }, + { + "epoch": 1.5153926188873248, + "grad_norm": 0.22439827024936676, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0042, + "step": 24760 + }, + { + "epoch": 1.516004651447457, + "grad_norm": 0.2269357591867447, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0039, + "step": 24770 + }, + { + "epoch": 1.5166166840075892, + "grad_norm": 0.20416954159736633, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0039, + "step": 24780 + }, + { + "epoch": 1.5172287165677214, + "grad_norm": 0.1766926646232605, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0031, + "step": 24790 + }, + { + "epoch": 1.5178407491278536, + "grad_norm": 0.05759773403406143, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0029, + "step": 24800 + }, + { + "epoch": 1.5184527816879858, + "grad_norm": 0.19152496755123138, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0034, + "step": 24810 + }, + { + "epoch": 1.519064814248118, + "grad_norm": 0.09876703470945358, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.5196768468082502, + "grad_norm": 0.11626110225915909, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0037, + "step": 24830 + }, + { + "epoch": 1.5202888793683824, + "grad_norm": 0.13713783025741577, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0027, + "step": 24840 + }, + { + "epoch": 1.5209009119285146, + "grad_norm": 0.19144660234451294, + "learning_rate": 3.36521439484193e-06, + "loss": 0.004, + "step": 24850 + }, + { + "epoch": 1.5215129444886468, + "grad_norm": 0.1376778483390808, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0037, + "step": 24860 + }, + { + "epoch": 1.522124977048779, + "grad_norm": 0.4120432436466217, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0042, + "step": 24870 + }, + { + "epoch": 1.5227370096089112, + "grad_norm": 0.14245551824569702, + "learning_rate": 3.349767211300933e-06, + "loss": 0.003, + "step": 24880 + }, + { + "epoch": 1.5233490421690434, + "grad_norm": 0.19136923551559448, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0053, + "step": 24890 + }, + { + "epoch": 1.5239610747291756, + "grad_norm": 0.28412777185440063, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0031, + "step": 24900 + }, + { + "epoch": 1.5245731072893078, + "grad_norm": 0.18925072252750397, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.003, + "step": 24910 + }, + { + "epoch": 1.52518513984944, + "grad_norm": 0.21378494799137115, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0034, + "step": 24920 + }, + { + "epoch": 1.5257971724095722, + "grad_norm": 0.19160443544387817, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0037, + "step": 24930 + }, + { + "epoch": 1.5264092049697044, + "grad_norm": 0.19070027768611908, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.5270212375298367, + "grad_norm": 0.20489074289798737, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.003, + "step": 24950 + }, + { + "epoch": 1.5276332700899689, + "grad_norm": 0.15747228264808655, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0037, + "step": 24960 + }, + { + "epoch": 1.528245302650101, + "grad_norm": 0.21312901377677917, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0035, + "step": 24970 + }, + { + "epoch": 1.5288573352102333, + "grad_norm": 0.10329846292734146, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0033, + "step": 24980 + }, + { + "epoch": 1.5294693677703655, + "grad_norm": 0.13872355222702026, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0056, + "step": 24990 + }, + { + "epoch": 1.5300814003304977, + "grad_norm": 0.08532251417636871, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0026, + "step": 25000 + }, + { + "epoch": 1.5306934328906299, + "grad_norm": 0.1309783011674881, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0038, + "step": 25010 + }, + { + "epoch": 1.531305465450762, + "grad_norm": 0.16484731435775757, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0034, + "step": 25020 + }, + { + "epoch": 1.5319174980108943, + "grad_norm": 0.1756003201007843, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0044, + "step": 25030 + }, + { + "epoch": 1.5325295305710265, + "grad_norm": 0.13745243847370148, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0063, + "step": 25040 + }, + { + "epoch": 1.5331415631311587, + "grad_norm": 0.1077183336019516, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0032, + "step": 25050 + }, + { + "epoch": 1.5337535956912909, + "grad_norm": 0.3091605007648468, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0036, + "step": 25060 + }, + { + "epoch": 1.534365628251423, + "grad_norm": 0.13469856977462769, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0031, + "step": 25070 + }, + { + "epoch": 1.5349776608115553, + "grad_norm": 0.2445354014635086, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0037, + "step": 25080 + }, + { + "epoch": 1.5355896933716875, + "grad_norm": 0.1065889522433281, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0038, + "step": 25090 + }, + { + "epoch": 1.5362017259318197, + "grad_norm": 0.1539459079504013, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0036, + "step": 25100 + }, + { + "epoch": 1.536813758491952, + "grad_norm": 0.23242861032485962, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0037, + "step": 25110 + }, + { + "epoch": 1.537425791052084, + "grad_norm": 0.18660615384578705, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0034, + "step": 25120 + }, + { + "epoch": 1.5380378236122163, + "grad_norm": 0.14089861512184143, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0057, + "step": 25130 + }, + { + "epoch": 1.5386498561723485, + "grad_norm": 0.30568358302116394, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0036, + "step": 25140 + }, + { + "epoch": 1.5392618887324807, + "grad_norm": 0.0965384691953659, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0029, + "step": 25150 + }, + { + "epoch": 1.539873921292613, + "grad_norm": 0.12925416231155396, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0028, + "step": 25160 + }, + { + "epoch": 1.5404859538527451, + "grad_norm": 0.10820749402046204, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0025, + "step": 25170 + }, + { + "epoch": 1.5410979864128773, + "grad_norm": 0.200232595205307, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0038, + "step": 25180 + }, + { + "epoch": 1.5417100189730095, + "grad_norm": 0.13515910506248474, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0037, + "step": 25190 + }, + { + "epoch": 1.5423220515331417, + "grad_norm": 0.08493158221244812, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0026, + "step": 25200 + }, + { + "epoch": 1.542934084093274, + "grad_norm": 0.21674226224422455, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0047, + "step": 25210 + }, + { + "epoch": 1.543546116653406, + "grad_norm": 0.18259066343307495, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0029, + "step": 25220 + }, + { + "epoch": 1.5441581492135381, + "grad_norm": 0.14857260882854462, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0029, + "step": 25230 + }, + { + "epoch": 1.5447701817736703, + "grad_norm": 0.1540914922952652, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0026, + "step": 25240 + }, + { + "epoch": 1.5453822143338025, + "grad_norm": 0.08827090263366699, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0029, + "step": 25250 + }, + { + "epoch": 1.5459942468939347, + "grad_norm": 0.07511961460113525, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0035, + "step": 25260 + }, + { + "epoch": 1.546606279454067, + "grad_norm": 0.26209381222724915, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0033, + "step": 25270 + }, + { + "epoch": 1.5472183120141991, + "grad_norm": 0.08861620724201202, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0033, + "step": 25280 + }, + { + "epoch": 1.5478303445743313, + "grad_norm": 0.1642802655696869, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0032, + "step": 25290 + }, + { + "epoch": 1.5484423771344635, + "grad_norm": 0.24771225452423096, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0034, + "step": 25300 + }, + { + "epoch": 1.5490544096945957, + "grad_norm": 0.2717854976654053, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.549666442254728, + "grad_norm": 0.12177802622318268, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0029, + "step": 25320 + }, + { + "epoch": 1.5502784748148601, + "grad_norm": 0.09988416731357574, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0052, + "step": 25330 + }, + { + "epoch": 1.5508905073749923, + "grad_norm": 0.08877446502447128, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0024, + "step": 25340 + }, + { + "epoch": 1.5515025399351245, + "grad_norm": 0.16233091056346893, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.003, + "step": 25350 + }, + { + "epoch": 1.5521145724952568, + "grad_norm": 0.10167178511619568, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0034, + "step": 25360 + }, + { + "epoch": 1.552726605055389, + "grad_norm": 0.14738866686820984, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0036, + "step": 25370 + }, + { + "epoch": 1.5533386376155212, + "grad_norm": 0.07526370882987976, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0036, + "step": 25380 + }, + { + "epoch": 1.5539506701756534, + "grad_norm": 0.1659732311964035, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0032, + "step": 25390 + }, + { + "epoch": 1.5545627027357856, + "grad_norm": 0.18707287311553955, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0023, + "step": 25400 + }, + { + "epoch": 1.5551747352959178, + "grad_norm": 0.21416662633419037, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0029, + "step": 25410 + }, + { + "epoch": 1.55578676785605, + "grad_norm": 0.3034561574459076, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5563988004161822, + "grad_norm": 0.18879717588424683, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5570108329763144, + "grad_norm": 0.12917254865169525, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0038, + "step": 25440 + }, + { + "epoch": 1.5576228655364466, + "grad_norm": 0.0970548763871193, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0046, + "step": 25450 + }, + { + "epoch": 1.5582348980965788, + "grad_norm": 0.17424598336219788, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.558846930656711, + "grad_norm": 0.11429346352815628, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0039, + "step": 25470 + }, + { + "epoch": 1.5594589632168432, + "grad_norm": 0.19154596328735352, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0028, + "step": 25480 + }, + { + "epoch": 1.5600709957769754, + "grad_norm": 0.1475156843662262, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0028, + "step": 25490 + }, + { + "epoch": 1.5606830283371074, + "grad_norm": 0.29066604375839233, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0037, + "step": 25500 + }, + { + "epoch": 1.5612950608972396, + "grad_norm": 0.21379634737968445, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.004, + "step": 25510 + }, + { + "epoch": 1.5619070934573718, + "grad_norm": 0.1648091822862625, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.003, + "step": 25520 + }, + { + "epoch": 1.562519126017504, + "grad_norm": 0.2791198790073395, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0038, + "step": 25530 + }, + { + "epoch": 1.5631311585776362, + "grad_norm": 0.13038018345832825, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0028, + "step": 25540 + }, + { + "epoch": 1.5637431911377684, + "grad_norm": 0.07513634115457535, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0032, + "step": 25550 + }, + { + "epoch": 1.5643552236979006, + "grad_norm": 0.34259703755378723, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5649672562580328, + "grad_norm": 0.1602829545736313, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0036, + "step": 25570 + }, + { + "epoch": 1.565579288818165, + "grad_norm": 0.11303776502609253, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.003, + "step": 25580 + }, + { + "epoch": 1.5661913213782972, + "grad_norm": 0.06348636001348495, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0038, + "step": 25590 + }, + { + "epoch": 1.5668033539384294, + "grad_norm": 0.2563594579696655, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0027, + "step": 25600 + }, + { + "epoch": 1.5674153864985616, + "grad_norm": 0.08260748535394669, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0029, + "step": 25610 + }, + { + "epoch": 1.5680274190586938, + "grad_norm": 0.15986980497837067, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0044, + "step": 25620 + }, + { + "epoch": 1.568639451618826, + "grad_norm": 0.19412761926651, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.004, + "step": 25630 + }, + { + "epoch": 1.5692514841789582, + "grad_norm": 0.16794568300247192, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0042, + "step": 25640 + }, + { + "epoch": 1.5698635167390904, + "grad_norm": 0.34898805618286133, + "learning_rate": 2.981383959667165e-06, + "loss": 0.003, + "step": 25650 + }, + { + "epoch": 1.5704755492992226, + "grad_norm": 0.11825685203075409, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0037, + "step": 25660 + }, + { + "epoch": 1.5710875818593548, + "grad_norm": 0.1430155634880066, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0049, + "step": 25670 + }, + { + "epoch": 1.571699614419487, + "grad_norm": 0.13148540258407593, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0031, + "step": 25680 + }, + { + "epoch": 1.5723116469796192, + "grad_norm": 0.14384756982326508, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0036, + "step": 25690 + }, + { + "epoch": 1.5729236795397514, + "grad_norm": 0.11322541534900665, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0028, + "step": 25700 + }, + { + "epoch": 1.5735357120998836, + "grad_norm": 0.1428067833185196, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0033, + "step": 25710 + }, + { + "epoch": 1.5741477446600158, + "grad_norm": 0.1169947013258934, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0024, + "step": 25720 + }, + { + "epoch": 1.574759777220148, + "grad_norm": 0.33150142431259155, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0038, + "step": 25730 + }, + { + "epoch": 1.5753718097802802, + "grad_norm": 0.12486983090639114, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.003, + "step": 25740 + }, + { + "epoch": 1.5759838423404124, + "grad_norm": 0.12485318630933762, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0025, + "step": 25750 + }, + { + "epoch": 1.5765958749005446, + "grad_norm": 0.10158280283212662, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0032, + "step": 25760 + }, + { + "epoch": 1.5772079074606769, + "grad_norm": 0.13820113241672516, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0041, + "step": 25770 + }, + { + "epoch": 1.577819940020809, + "grad_norm": 0.18718287348747253, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0027, + "step": 25780 + }, + { + "epoch": 1.5784319725809413, + "grad_norm": 0.154324010014534, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.004, + "step": 25790 + }, + { + "epoch": 1.5790440051410735, + "grad_norm": 0.10862802714109421, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0039, + "step": 25800 + }, + { + "epoch": 1.5796560377012057, + "grad_norm": 0.11738114804029465, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0032, + "step": 25810 + }, + { + "epoch": 1.5802680702613379, + "grad_norm": 0.08674368262290955, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0035, + "step": 25820 + }, + { + "epoch": 1.58088010282147, + "grad_norm": 0.16917847096920013, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0032, + "step": 25830 + }, + { + "epoch": 1.5814921353816023, + "grad_norm": 0.10122957825660706, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0044, + "step": 25840 + }, + { + "epoch": 1.5821041679417345, + "grad_norm": 0.14450572431087494, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0031, + "step": 25850 + }, + { + "epoch": 1.5827162005018667, + "grad_norm": 0.11220426112413406, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0035, + "step": 25860 + }, + { + "epoch": 1.5833282330619989, + "grad_norm": 0.15793107450008392, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0034, + "step": 25870 + }, + { + "epoch": 1.583940265622131, + "grad_norm": 0.11485118418931961, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0049, + "step": 25880 + }, + { + "epoch": 1.5845522981822633, + "grad_norm": 0.11588255316019058, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0032, + "step": 25890 + }, + { + "epoch": 1.5851643307423955, + "grad_norm": 0.09770877659320831, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0033, + "step": 25900 + }, + { + "epoch": 1.5857763633025277, + "grad_norm": 0.4078996479511261, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0041, + "step": 25910 + }, + { + "epoch": 1.58638839586266, + "grad_norm": 0.16744333505630493, + "learning_rate": 2.865295218604555e-06, + "loss": 0.003, + "step": 25920 + }, + { + "epoch": 1.587000428422792, + "grad_norm": 0.10358662158250809, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0034, + "step": 25930 + }, + { + "epoch": 1.5876124609829243, + "grad_norm": 0.1420212686061859, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0024, + "step": 25940 + }, + { + "epoch": 1.5882244935430565, + "grad_norm": 0.1387208104133606, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0035, + "step": 25950 + }, + { + "epoch": 1.5888365261031887, + "grad_norm": 0.2383398711681366, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0044, + "step": 25960 + }, + { + "epoch": 1.589448558663321, + "grad_norm": 0.1263049691915512, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0029, + "step": 25970 + }, + { + "epoch": 1.5900605912234531, + "grad_norm": 0.10938797891139984, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0029, + "step": 25980 + }, + { + "epoch": 1.5906726237835853, + "grad_norm": 0.18173988163471222, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0033, + "step": 25990 + }, + { + "epoch": 1.5912846563437175, + "grad_norm": 0.20956522226333618, + "learning_rate": 2.832230653119002e-06, + "loss": 0.003, + "step": 26000 + }, + { + "epoch": 1.5918966889038497, + "grad_norm": 0.5168828368186951, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0038, + "step": 26010 + }, + { + "epoch": 1.592508721463982, + "grad_norm": 0.19130735099315643, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.003, + "step": 26020 + }, + { + "epoch": 1.5931207540241141, + "grad_norm": 0.2398800253868103, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0031, + "step": 26030 + }, + { + "epoch": 1.5937327865842463, + "grad_norm": 0.13288211822509766, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0039, + "step": 26040 + }, + { + "epoch": 1.5943448191443785, + "grad_norm": 0.12008156627416611, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.005, + "step": 26050 + }, + { + "epoch": 1.5949568517045107, + "grad_norm": 0.06939925253391266, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0026, + "step": 26060 + }, + { + "epoch": 1.595568884264643, + "grad_norm": 0.11179028451442719, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0032, + "step": 26070 + }, + { + "epoch": 1.5961809168247751, + "grad_norm": 0.07841819524765015, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0035, + "step": 26080 + }, + { + "epoch": 1.5967929493849073, + "grad_norm": 0.3470489978790283, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.0067, + "step": 26090 + }, + { + "epoch": 1.5974049819450395, + "grad_norm": 0.13002917170524597, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0035, + "step": 26100 + }, + { + "epoch": 1.5980170145051718, + "grad_norm": 0.10265816748142242, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0026, + "step": 26110 + }, + { + "epoch": 1.598629047065304, + "grad_norm": 0.0917414203286171, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.003, + "step": 26120 + }, + { + "epoch": 1.5992410796254362, + "grad_norm": 0.11112091690301895, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0039, + "step": 26130 + }, + { + "epoch": 1.5998531121855684, + "grad_norm": 0.08949574083089828, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0035, + "step": 26140 + }, + { + "epoch": 1.6004651447457006, + "grad_norm": 0.10606437176465988, + "learning_rate": 2.771889969647e-06, + "loss": 0.0046, + "step": 26150 + }, + { + "epoch": 1.6010771773058328, + "grad_norm": 0.1891089379787445, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0025, + "step": 26160 + }, + { + "epoch": 1.601689209865965, + "grad_norm": 0.11007837951183319, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0032, + "step": 26170 + }, + { + "epoch": 1.6023012424260972, + "grad_norm": 0.2129961997270584, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0032, + "step": 26180 + }, + { + "epoch": 1.6029132749862294, + "grad_norm": 0.2265758216381073, + "learning_rate": 2.756165401834933e-06, + "loss": 0.003, + "step": 26190 + }, + { + "epoch": 1.6035253075463616, + "grad_norm": 0.29450783133506775, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.0039, + "step": 26200 + }, + { + "epoch": 1.6041373401064938, + "grad_norm": 0.48828232288360596, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0054, + "step": 26210 + }, + { + "epoch": 1.604749372666626, + "grad_norm": 0.2561551630496979, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0038, + "step": 26220 + }, + { + "epoch": 1.6053614052267582, + "grad_norm": 0.1838567554950714, + "learning_rate": 2.740595627381096e-06, + "loss": 0.004, + "step": 26230 + }, + { + "epoch": 1.6059734377868904, + "grad_norm": 0.1419040560722351, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0028, + "step": 26240 + }, + { + "epoch": 1.6065854703470226, + "grad_norm": 0.11946547776460648, + "learning_rate": 2.732868879137055e-06, + "loss": 0.004, + "step": 26250 + }, + { + "epoch": 1.6071975029071548, + "grad_norm": 0.2451052963733673, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0041, + "step": 26260 + }, + { + "epoch": 1.607809535467287, + "grad_norm": 0.11013349890708923, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0031, + "step": 26270 + }, + { + "epoch": 1.6084215680274192, + "grad_norm": 0.13513876497745514, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0067, + "step": 26280 + }, + { + "epoch": 1.6090336005875514, + "grad_norm": 0.13167037069797516, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0054, + "step": 26290 + }, + { + "epoch": 1.6096456331476836, + "grad_norm": 0.17578460276126862, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0042, + "step": 26300 + }, + { + "epoch": 1.6102576657078158, + "grad_norm": 0.26278436183929443, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0052, + "step": 26310 + }, + { + "epoch": 1.610869698267948, + "grad_norm": 0.12841887772083282, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.0029, + "step": 26320 + }, + { + "epoch": 1.6114817308280802, + "grad_norm": 0.08532734215259552, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0037, + "step": 26330 + }, + { + "epoch": 1.6120937633882122, + "grad_norm": 0.23955127596855164, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.0026, + "step": 26340 + }, + { + "epoch": 1.6127057959483444, + "grad_norm": 0.11942708492279053, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0032, + "step": 26350 + }, + { + "epoch": 1.6133178285084766, + "grad_norm": 0.2980901002883911, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.0036, + "step": 26360 + }, + { + "epoch": 1.6139298610686088, + "grad_norm": 0.18042345345020294, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0023, + "step": 26370 + }, + { + "epoch": 1.614541893628741, + "grad_norm": 0.09250669926404953, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0028, + "step": 26380 + }, + { + "epoch": 1.6151539261888732, + "grad_norm": 0.11877484619617462, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.0042, + "step": 26390 + }, + { + "epoch": 1.6157659587490054, + "grad_norm": 0.20574252307415009, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0028, + "step": 26400 + }, + { + "epoch": 1.6163779913091376, + "grad_norm": 0.18342842161655426, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0028, + "step": 26410 + }, + { + "epoch": 1.6169900238692698, + "grad_norm": 0.18038654327392578, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0031, + "step": 26420 + }, + { + "epoch": 1.617602056429402, + "grad_norm": 0.14160999655723572, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0035, + "step": 26430 + }, + { + "epoch": 1.6182140889895342, + "grad_norm": 0.09427947551012039, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0041, + "step": 26440 + }, + { + "epoch": 1.6188261215496664, + "grad_norm": 0.07515032589435577, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0032, + "step": 26450 + }, + { + "epoch": 1.6194381541097986, + "grad_norm": 0.19633768498897552, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0043, + "step": 26460 + }, + { + "epoch": 1.6200501866699308, + "grad_norm": 0.22237136960029602, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.003, + "step": 26470 + }, + { + "epoch": 1.620662219230063, + "grad_norm": 0.21898943185806274, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0033, + "step": 26480 + }, + { + "epoch": 1.6212742517901952, + "grad_norm": 0.14833909273147583, + "learning_rate": 2.643185095075473e-06, + "loss": 0.003, + "step": 26490 + }, + { + "epoch": 1.6218862843503274, + "grad_norm": 0.10988935828208923, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0036, + "step": 26500 + }, + { + "epoch": 1.6224983169104596, + "grad_norm": 0.17635370790958405, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0047, + "step": 26510 + }, + { + "epoch": 1.6231103494705919, + "grad_norm": 0.15108852088451385, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0034, + "step": 26520 + }, + { + "epoch": 1.623722382030724, + "grad_norm": 0.1829880177974701, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0041, + "step": 26530 + }, + { + "epoch": 1.6243344145908563, + "grad_norm": 0.15146563947200775, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0034, + "step": 26540 + }, + { + "epoch": 1.6249464471509885, + "grad_norm": 0.1440849006175995, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0044, + "step": 26550 + }, + { + "epoch": 1.6255584797111207, + "grad_norm": 0.1681547313928604, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0045, + "step": 26560 + }, + { + "epoch": 1.6261705122712529, + "grad_norm": 0.07170043885707855, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0028, + "step": 26570 + }, + { + "epoch": 1.626782544831385, + "grad_norm": 0.0961712971329689, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0026, + "step": 26580 + }, + { + "epoch": 1.6273945773915173, + "grad_norm": 0.0957784354686737, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0031, + "step": 26590 + }, + { + "epoch": 1.6280066099516495, + "grad_norm": 0.09888478368520737, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0042, + "step": 26600 + }, + { + "epoch": 1.6286186425117817, + "grad_norm": 0.1469460278749466, + "learning_rate": 2.600457796416397e-06, + "loss": 0.003, + "step": 26610 + }, + { + "epoch": 1.6292306750719139, + "grad_norm": 0.23431086540222168, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.003, + "step": 26620 + }, + { + "epoch": 1.6298427076320459, + "grad_norm": 0.11390798538923264, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0026, + "step": 26630 + }, + { + "epoch": 1.630454740192178, + "grad_norm": 0.17735126614570618, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0032, + "step": 26640 + }, + { + "epoch": 1.6310667727523103, + "grad_norm": 0.047082606703042984, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0036, + "step": 26650 + }, + { + "epoch": 1.6316788053124425, + "grad_norm": 0.3262721598148346, + "learning_rate": 2.583073279935805e-06, + "loss": 0.004, + "step": 26660 + }, + { + "epoch": 1.6322908378725747, + "grad_norm": 0.2153632938861847, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.003, + "step": 26670 + }, + { + "epoch": 1.6329028704327069, + "grad_norm": 0.12398967891931534, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0031, + "step": 26680 + }, + { + "epoch": 1.633514902992839, + "grad_norm": 0.404419481754303, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0038, + "step": 26690 + }, + { + "epoch": 1.6341269355529713, + "grad_norm": 0.3094029426574707, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.005, + "step": 26700 + }, + { + "epoch": 1.6347389681131035, + "grad_norm": 0.23702147603034973, + "learning_rate": 2.565935706183804e-06, + "loss": 0.003, + "step": 26710 + }, + { + "epoch": 1.6353510006732357, + "grad_norm": 0.175592839717865, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0038, + "step": 26720 + }, + { + "epoch": 1.635963033233368, + "grad_norm": 0.20330312848091125, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0034, + "step": 26730 + }, + { + "epoch": 1.6365750657935, + "grad_norm": 0.1990291029214859, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0031, + "step": 26740 + }, + { + "epoch": 1.6371870983536323, + "grad_norm": 0.2611120343208313, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0026, + "step": 26750 + }, + { + "epoch": 1.6377991309137645, + "grad_norm": 0.15563850104808807, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0029, + "step": 26760 + }, + { + "epoch": 1.6384111634738967, + "grad_norm": 0.10159289091825485, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0027, + "step": 26770 + }, + { + "epoch": 1.639023196034029, + "grad_norm": 0.14164364337921143, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0022, + "step": 26780 + }, + { + "epoch": 1.639635228594161, + "grad_norm": 0.09149957448244095, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0042, + "step": 26790 + }, + { + "epoch": 1.6402472611542933, + "grad_norm": 0.19528718292713165, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0021, + "step": 26800 + }, + { + "epoch": 1.6408592937144255, + "grad_norm": 0.11716540157794952, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0029, + "step": 26810 + }, + { + "epoch": 1.6414713262745577, + "grad_norm": 0.06402851641178131, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0023, + "step": 26820 + }, + { + "epoch": 1.64208335883469, + "grad_norm": 0.12224840372800827, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0029, + "step": 26830 + }, + { + "epoch": 1.6426953913948221, + "grad_norm": 0.13217593729496002, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0035, + "step": 26840 + }, + { + "epoch": 1.6433074239549543, + "grad_norm": 0.15030793845653534, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0026, + "step": 26850 + }, + { + "epoch": 1.6439194565150865, + "grad_norm": 0.10057740658521652, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0044, + "step": 26860 + }, + { + "epoch": 1.6445314890752187, + "grad_norm": 0.19387565553188324, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0032, + "step": 26870 + }, + { + "epoch": 1.645143521635351, + "grad_norm": 0.32513365149497986, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0026, + "step": 26880 + }, + { + "epoch": 1.6457555541954831, + "grad_norm": 0.11426142603158951, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0035, + "step": 26890 + }, + { + "epoch": 1.6463675867556153, + "grad_norm": 0.15678571164608002, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0033, + "step": 26900 + }, + { + "epoch": 1.6469796193157475, + "grad_norm": 0.0901828184723854, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0033, + "step": 26910 + }, + { + "epoch": 1.6475916518758797, + "grad_norm": 0.1439771205186844, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0036, + "step": 26920 + }, + { + "epoch": 1.648203684436012, + "grad_norm": 0.08516893535852432, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0031, + "step": 26930 + }, + { + "epoch": 1.6488157169961442, + "grad_norm": 0.13487808406352997, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0038, + "step": 26940 + }, + { + "epoch": 1.6494277495562764, + "grad_norm": 0.12181483954191208, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0048, + "step": 26950 + }, + { + "epoch": 1.6500397821164086, + "grad_norm": 0.11907542496919632, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0028, + "step": 26960 + }, + { + "epoch": 1.6506518146765408, + "grad_norm": 0.11463847011327744, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0032, + "step": 26970 + }, + { + "epoch": 1.651263847236673, + "grad_norm": 0.10308004170656204, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0027, + "step": 26980 + }, + { + "epoch": 1.6518758797968052, + "grad_norm": 0.1553436815738678, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0032, + "step": 26990 + }, + { + "epoch": 1.6524879123569374, + "grad_norm": 0.11983859539031982, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0031, + "step": 27000 + }, + { + "epoch": 1.6530999449170696, + "grad_norm": 0.07867950201034546, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.003, + "step": 27010 + }, + { + "epoch": 1.6537119774772018, + "grad_norm": 0.0990489274263382, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0035, + "step": 27020 + }, + { + "epoch": 1.654324010037334, + "grad_norm": 0.15849289298057556, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0029, + "step": 27030 + }, + { + "epoch": 1.6549360425974662, + "grad_norm": 0.23918525874614716, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0032, + "step": 27040 + }, + { + "epoch": 1.6555480751575984, + "grad_norm": 0.15686926245689392, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.003, + "step": 27050 + }, + { + "epoch": 1.6561601077177306, + "grad_norm": 0.06435749679803848, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.003, + "step": 27060 + }, + { + "epoch": 1.6567721402778628, + "grad_norm": 0.1966746598482132, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0029, + "step": 27070 + }, + { + "epoch": 1.657384172837995, + "grad_norm": 0.1173984557390213, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0034, + "step": 27080 + }, + { + "epoch": 1.6579962053981272, + "grad_norm": 0.15185165405273438, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0036, + "step": 27090 + }, + { + "epoch": 1.6586082379582594, + "grad_norm": 0.1371954381465912, + "learning_rate": 2.440792688039862e-06, + "loss": 0.002, + "step": 27100 + }, + { + "epoch": 1.6592202705183916, + "grad_norm": 0.10718704760074615, + "learning_rate": 2.437783861778914e-06, + "loss": 0.003, + "step": 27110 + }, + { + "epoch": 1.6598323030785238, + "grad_norm": 0.09085255861282349, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0028, + "step": 27120 + }, + { + "epoch": 1.660444335638656, + "grad_norm": 0.12604662775993347, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0029, + "step": 27130 + }, + { + "epoch": 1.6610563681987882, + "grad_norm": 0.06227592006325722, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0034, + "step": 27140 + }, + { + "epoch": 1.6616684007589204, + "grad_norm": 0.15667739510536194, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0029, + "step": 27150 + }, + { + "epoch": 1.6622804333190526, + "grad_norm": 0.11927297711372375, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0025, + "step": 27160 + }, + { + "epoch": 1.6628924658791848, + "grad_norm": 0.13583429157733917, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0049, + "step": 27170 + }, + { + "epoch": 1.663504498439317, + "grad_norm": 0.31264790892601013, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0032, + "step": 27180 + }, + { + "epoch": 1.6641165309994492, + "grad_norm": 0.1507059931755066, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0041, + "step": 27190 + }, + { + "epoch": 1.6647285635595814, + "grad_norm": 0.22571611404418945, + "learning_rate": 2.411157015949963e-06, + "loss": 0.006, + "step": 27200 + }, + { + "epoch": 1.6653405961197136, + "grad_norm": 0.07582036405801773, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0025, + "step": 27210 + }, + { + "epoch": 1.6659526286798458, + "grad_norm": 0.16827397048473358, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0025, + "step": 27220 + }, + { + "epoch": 1.666564661239978, + "grad_norm": 0.26645299792289734, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0034, + "step": 27230 + }, + { + "epoch": 1.6671766938001102, + "grad_norm": 0.15947957336902618, + "learning_rate": 2.399584779188417e-06, + "loss": 0.003, + "step": 27240 + }, + { + "epoch": 1.6677887263602424, + "grad_norm": 0.16127845644950867, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0049, + "step": 27250 + }, + { + "epoch": 1.6684007589203746, + "grad_norm": 0.1279461681842804, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0027, + "step": 27260 + }, + { + "epoch": 1.6690127914805069, + "grad_norm": 0.06649098545312881, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0021, + "step": 27270 + }, + { + "epoch": 1.669624824040639, + "grad_norm": 0.196940615773201, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0024, + "step": 27280 + }, + { + "epoch": 1.6702368566007713, + "grad_norm": 0.07980433851480484, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0028, + "step": 27290 + }, + { + "epoch": 1.6708488891609035, + "grad_norm": 0.10023880004882812, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0027, + "step": 27300 + }, + { + "epoch": 1.6714609217210357, + "grad_norm": 0.12118209153413773, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0035, + "step": 27310 + }, + { + "epoch": 1.6720729542811679, + "grad_norm": 0.1536104530096054, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0032, + "step": 27320 + }, + { + "epoch": 1.6726849868413, + "grad_norm": 0.0671612024307251, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0025, + "step": 27330 + }, + { + "epoch": 1.6732970194014323, + "grad_norm": 0.17756326496601105, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0033, + "step": 27340 + }, + { + "epoch": 1.6739090519615645, + "grad_norm": 0.07412310689687729, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0039, + "step": 27350 + }, + { + "epoch": 1.6745210845216967, + "grad_norm": 0.17036253213882446, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.0046, + "step": 27360 + }, + { + "epoch": 1.6751331170818289, + "grad_norm": 0.07159245759248734, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0025, + "step": 27370 + }, + { + "epoch": 1.675745149641961, + "grad_norm": 0.11311008781194687, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0028, + "step": 27380 + }, + { + "epoch": 1.6763571822020933, + "grad_norm": 0.062365781515836716, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0029, + "step": 27390 + }, + { + "epoch": 1.6769692147622255, + "grad_norm": 0.1132882833480835, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.003, + "step": 27400 + }, + { + "epoch": 1.6775812473223577, + "grad_norm": 0.2946174740791321, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.003, + "step": 27410 + }, + { + "epoch": 1.67819327988249, + "grad_norm": 0.22978715598583221, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0028, + "step": 27420 + }, + { + "epoch": 1.678805312442622, + "grad_norm": 0.12381251156330109, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0035, + "step": 27430 + }, + { + "epoch": 1.6794173450027543, + "grad_norm": 0.3918306231498718, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0036, + "step": 27440 + }, + { + "epoch": 1.6800293775628865, + "grad_norm": 0.1729428470134735, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0039, + "step": 27450 + }, + { + "epoch": 1.6806414101230187, + "grad_norm": 0.10841631144285202, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0032, + "step": 27460 + }, + { + "epoch": 1.6812534426831507, + "grad_norm": 0.12045114487409592, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0048, + "step": 27470 + }, + { + "epoch": 1.681865475243283, + "grad_norm": 0.15946263074874878, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0025, + "step": 27480 + }, + { + "epoch": 1.682477507803415, + "grad_norm": 0.20978282392024994, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0039, + "step": 27490 + }, + { + "epoch": 1.6830895403635473, + "grad_norm": 0.4889276325702667, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.005, + "step": 27500 + }, + { + "epoch": 1.6837015729236795, + "grad_norm": 0.10033760219812393, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0032, + "step": 27510 + }, + { + "epoch": 1.6843136054838117, + "grad_norm": 0.16516660153865814, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0043, + "step": 27520 + }, + { + "epoch": 1.684925638043944, + "grad_norm": 0.15988346934318542, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0031, + "step": 27530 + }, + { + "epoch": 1.685537670604076, + "grad_norm": 0.0838918536901474, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0033, + "step": 27540 + }, + { + "epoch": 1.6861497031642083, + "grad_norm": 0.09774886816740036, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0037, + "step": 27550 + }, + { + "epoch": 1.6867617357243405, + "grad_norm": 0.11428319662809372, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0028, + "step": 27560 + }, + { + "epoch": 1.6873737682844727, + "grad_norm": 0.0789853185415268, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0033, + "step": 27570 + }, + { + "epoch": 1.687985800844605, + "grad_norm": 0.12702232599258423, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0033, + "step": 27580 + }, + { + "epoch": 1.6885978334047371, + "grad_norm": 0.12080296128988266, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0027, + "step": 27590 + }, + { + "epoch": 1.6892098659648693, + "grad_norm": 0.21917396783828735, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0032, + "step": 27600 + }, + { + "epoch": 1.6898218985250015, + "grad_norm": 0.28265318274497986, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0036, + "step": 27610 + }, + { + "epoch": 1.6904339310851337, + "grad_norm": 0.09106706827878952, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0029, + "step": 27620 + }, + { + "epoch": 1.691045963645266, + "grad_norm": 0.1670890897512436, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.0024, + "step": 27630 + }, + { + "epoch": 1.6916579962053981, + "grad_norm": 0.16830581426620483, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0033, + "step": 27640 + }, + { + "epoch": 1.6922700287655303, + "grad_norm": 0.3394775092601776, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0045, + "step": 27650 + }, + { + "epoch": 1.6928820613256625, + "grad_norm": 0.11403192579746246, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0047, + "step": 27660 + }, + { + "epoch": 1.6934940938857947, + "grad_norm": 0.12133318930864334, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0038, + "step": 27670 + }, + { + "epoch": 1.694106126445927, + "grad_norm": 0.2074453979730606, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0045, + "step": 27680 + }, + { + "epoch": 1.6947181590060592, + "grad_norm": 0.0654371827840805, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0023, + "step": 27690 + }, + { + "epoch": 1.6953301915661914, + "grad_norm": 0.3289278745651245, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0043, + "step": 27700 + }, + { + "epoch": 1.6959422241263236, + "grad_norm": 0.10692958533763885, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0026, + "step": 27710 + }, + { + "epoch": 1.6965542566864558, + "grad_norm": 0.06414066255092621, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0022, + "step": 27720 + }, + { + "epoch": 1.697166289246588, + "grad_norm": 0.22467097640037537, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0028, + "step": 27730 + }, + { + "epoch": 1.6977783218067202, + "grad_norm": 0.14074043929576874, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0033, + "step": 27740 + }, + { + "epoch": 1.6983903543668522, + "grad_norm": 0.17113615572452545, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0033, + "step": 27750 + }, + { + "epoch": 1.6990023869269844, + "grad_norm": 0.09429248422384262, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0027, + "step": 27760 + }, + { + "epoch": 1.6996144194871166, + "grad_norm": 0.06843049824237823, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0044, + "step": 27770 + }, + { + "epoch": 1.7002264520472488, + "grad_norm": 0.08251061290502548, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0029, + "step": 27780 + }, + { + "epoch": 1.700838484607381, + "grad_norm": 0.29461193084716797, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0037, + "step": 27790 + }, + { + "epoch": 1.7014505171675132, + "grad_norm": 0.11461394280195236, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0027, + "step": 27800 + }, + { + "epoch": 1.7020625497276454, + "grad_norm": 0.15875136852264404, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.0031, + "step": 27810 + }, + { + "epoch": 1.7026745822877776, + "grad_norm": 0.097860187292099, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0034, + "step": 27820 + }, + { + "epoch": 1.7032866148479098, + "grad_norm": 0.07356908917427063, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0026, + "step": 27830 + }, + { + "epoch": 1.703898647408042, + "grad_norm": 0.1890958547592163, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0037, + "step": 27840 + }, + { + "epoch": 1.7045106799681742, + "grad_norm": 0.1173754408955574, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0034, + "step": 27850 + }, + { + "epoch": 1.7051227125283064, + "grad_norm": 0.2559126019477844, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0024, + "step": 27860 + }, + { + "epoch": 1.7057347450884386, + "grad_norm": 0.17337289452552795, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0026, + "step": 27870 + }, + { + "epoch": 1.7063467776485708, + "grad_norm": 0.34073203802108765, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0057, + "step": 27880 + }, + { + "epoch": 1.706958810208703, + "grad_norm": 0.1395779252052307, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0029, + "step": 27890 + }, + { + "epoch": 1.7075708427688352, + "grad_norm": 0.07645416259765625, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0034, + "step": 27900 + }, + { + "epoch": 1.7081828753289674, + "grad_norm": 0.19695641100406647, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0031, + "step": 27910 + }, + { + "epoch": 1.7087949078890996, + "grad_norm": 0.09641100466251373, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0029, + "step": 27920 + }, + { + "epoch": 1.7094069404492318, + "grad_norm": 0.13393571972846985, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0029, + "step": 27930 + }, + { + "epoch": 1.710018973009364, + "grad_norm": 0.12252296507358551, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0035, + "step": 27940 + }, + { + "epoch": 1.7106310055694962, + "grad_norm": 0.18026909232139587, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0033, + "step": 27950 + }, + { + "epoch": 1.7112430381296284, + "grad_norm": 0.11210714280605316, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0041, + "step": 27960 + }, + { + "epoch": 1.7118550706897606, + "grad_norm": 0.08154530823230743, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0023, + "step": 27970 + }, + { + "epoch": 1.7124671032498928, + "grad_norm": 0.11625959724187851, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.002, + "step": 27980 + }, + { + "epoch": 1.713079135810025, + "grad_norm": 0.17261847853660583, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0034, + "step": 27990 + }, + { + "epoch": 1.7136911683701572, + "grad_norm": 0.2842121422290802, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0037, + "step": 28000 + }, + { + "epoch": 1.7143032009302894, + "grad_norm": 0.0846833735704422, + "learning_rate": 2.208532855337684e-06, + "loss": 0.003, + "step": 28010 + }, + { + "epoch": 1.7149152334904216, + "grad_norm": 0.14987042546272278, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.0028, + "step": 28020 + }, + { + "epoch": 1.7155272660505538, + "grad_norm": 0.11375584453344345, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0035, + "step": 28030 + }, + { + "epoch": 1.716139298610686, + "grad_norm": 0.15772263705730438, + "learning_rate": 2.202319410491029e-06, + "loss": 0.0028, + "step": 28040 + }, + { + "epoch": 1.7167513311708182, + "grad_norm": 0.09632930904626846, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.0026, + "step": 28050 + }, + { + "epoch": 1.7173633637309504, + "grad_norm": 0.10105090588331223, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0028, + "step": 28060 + }, + { + "epoch": 1.7179753962910826, + "grad_norm": 0.07200506329536438, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0026, + "step": 28070 + }, + { + "epoch": 1.7185874288512148, + "grad_norm": 0.11725947260856628, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0036, + "step": 28080 + }, + { + "epoch": 1.719199461411347, + "grad_norm": 0.20034383237361908, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.0027, + "step": 28090 + }, + { + "epoch": 1.7198114939714793, + "grad_norm": 0.1051931157708168, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0035, + "step": 28100 + }, + { + "epoch": 1.7204235265316115, + "grad_norm": 0.15733452141284943, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0032, + "step": 28110 + }, + { + "epoch": 1.7210355590917437, + "grad_norm": 0.1562259942293167, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.0037, + "step": 28120 + }, + { + "epoch": 1.7216475916518759, + "grad_norm": 0.12120307981967926, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0027, + "step": 28130 + }, + { + "epoch": 1.722259624212008, + "grad_norm": 0.10682159662246704, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0028, + "step": 28140 + }, + { + "epoch": 1.7228716567721403, + "grad_norm": 0.3744218349456787, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0031, + "step": 28150 + }, + { + "epoch": 1.7234836893322725, + "grad_norm": 0.10296724736690521, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0037, + "step": 28160 + }, + { + "epoch": 1.7240957218924047, + "grad_norm": 0.10784043371677399, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0026, + "step": 28170 + }, + { + "epoch": 1.7247077544525369, + "grad_norm": 0.11451563239097595, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0042, + "step": 28180 + }, + { + "epoch": 1.725319787012669, + "grad_norm": 0.23215091228485107, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0057, + "step": 28190 + }, + { + "epoch": 1.7259318195728013, + "grad_norm": 0.26827526092529297, + "learning_rate": 2.17075192815944e-06, + "loss": 0.0045, + "step": 28200 + }, + { + "epoch": 1.7265438521329335, + "grad_norm": 0.18574558198451996, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0031, + "step": 28210 + }, + { + "epoch": 1.7271558846930657, + "grad_norm": 0.13175436854362488, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0037, + "step": 28220 + }, + { + "epoch": 1.727767917253198, + "grad_norm": 0.11453181505203247, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0024, + "step": 28230 + }, + { + "epoch": 1.72837994981333, + "grad_norm": 0.07552323490381241, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0028, + "step": 28240 + }, + { + "epoch": 1.7289919823734623, + "grad_norm": 0.19949491322040558, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0037, + "step": 28250 + }, + { + "epoch": 1.7296040149335945, + "grad_norm": 0.11841476708650589, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.003, + "step": 28260 + }, + { + "epoch": 1.7302160474937267, + "grad_norm": 0.07479251176118851, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0026, + "step": 28270 + }, + { + "epoch": 1.730828080053859, + "grad_norm": 0.09975548088550568, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0038, + "step": 28280 + }, + { + "epoch": 1.7314401126139911, + "grad_norm": 0.11624854803085327, + "learning_rate": 2.154160296808588e-06, + "loss": 0.0022, + "step": 28290 + }, + { + "epoch": 1.7320521451741233, + "grad_norm": 0.11251319199800491, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0028, + "step": 28300 + }, + { + "epoch": 1.7326641777342555, + "grad_norm": 0.1166340559720993, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0021, + "step": 28310 + }, + { + "epoch": 1.7332762102943877, + "grad_norm": 0.09875024855136871, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0035, + "step": 28320 + }, + { + "epoch": 1.73388824285452, + "grad_norm": 0.1096075028181076, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0023, + "step": 28330 + }, + { + "epoch": 1.7345002754146521, + "grad_norm": 0.07303491234779358, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0033, + "step": 28340 + }, + { + "epoch": 1.7351123079747843, + "grad_norm": 0.079298235476017, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0029, + "step": 28350 + }, + { + "epoch": 1.7357243405349165, + "grad_norm": 0.23943912982940674, + "learning_rate": 2.141836952964938e-06, + "loss": 0.0028, + "step": 28360 + }, + { + "epoch": 1.7363363730950487, + "grad_norm": 0.16530318558216095, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0026, + "step": 28370 + }, + { + "epoch": 1.736948405655181, + "grad_norm": 0.30809924006462097, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0038, + "step": 28380 + }, + { + "epoch": 1.7375604382153131, + "grad_norm": 0.1776202917098999, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0025, + "step": 28390 + }, + { + "epoch": 1.7381724707754453, + "grad_norm": 0.12845134735107422, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0025, + "step": 28400 + }, + { + "epoch": 1.7387845033355775, + "grad_norm": 0.3023861050605774, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0027, + "step": 28410 + }, + { + "epoch": 1.7393965358957098, + "grad_norm": 0.20150741934776306, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0037, + "step": 28420 + }, + { + "epoch": 1.740008568455842, + "grad_norm": 0.13612216711044312, + "learning_rate": 2.130022988569117e-06, + "loss": 0.003, + "step": 28430 + }, + { + "epoch": 1.7406206010159742, + "grad_norm": 0.08449587225914001, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0029, + "step": 28440 + }, + { + "epoch": 1.7412326335761064, + "grad_norm": 0.15262214839458466, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0036, + "step": 28450 + }, + { + "epoch": 1.7418446661362386, + "grad_norm": 0.12141858786344528, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0033, + "step": 28460 + }, + { + "epoch": 1.7424566986963708, + "grad_norm": 0.16376341879367828, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0026, + "step": 28470 + }, + { + "epoch": 1.743068731256503, + "grad_norm": 0.20567956566810608, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0035, + "step": 28480 + }, + { + "epoch": 1.7436807638166352, + "grad_norm": 0.09294500946998596, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.003, + "step": 28490 + }, + { + "epoch": 1.7442927963767674, + "grad_norm": 0.08701831847429276, + "learning_rate": 2.118719082970852e-06, + "loss": 0.0035, + "step": 28500 + }, + { + "epoch": 1.7449048289368996, + "grad_norm": 0.05340641364455223, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0033, + "step": 28510 + }, + { + "epoch": 1.7455168614970318, + "grad_norm": 0.16895434260368347, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0041, + "step": 28520 + }, + { + "epoch": 1.746128894057164, + "grad_norm": 0.192590594291687, + "learning_rate": 2.114030854571176e-06, + "loss": 0.004, + "step": 28530 + }, + { + "epoch": 1.7467409266172962, + "grad_norm": 0.07753138244152069, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0029, + "step": 28540 + }, + { + "epoch": 1.7473529591774284, + "grad_norm": 0.2521173655986786, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.004, + "step": 28550 + }, + { + "epoch": 1.7479649917375606, + "grad_norm": 0.0666038915514946, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0027, + "step": 28560 + }, + { + "epoch": 1.7485770242976928, + "grad_norm": 0.098371222615242, + "learning_rate": 2.107925886189472e-06, + "loss": 0.0046, + "step": 28570 + }, + { + "epoch": 1.749189056857825, + "grad_norm": 0.10023763030767441, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.003, + "step": 28580 + }, + { + "epoch": 1.7498010894179572, + "grad_norm": 0.18487419188022614, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0023, + "step": 28590 + }, + { + "epoch": 1.7504131219780892, + "grad_norm": 0.068140909075737, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0031, + "step": 28600 + }, + { + "epoch": 1.7510251545382214, + "grad_norm": 0.203145369887352, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.0045, + "step": 28610 + }, + { + "epoch": 1.7516371870983536, + "grad_norm": 0.1300811469554901, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0033, + "step": 28620 + }, + { + "epoch": 1.7522492196584858, + "grad_norm": 0.10490277409553528, + "learning_rate": 2.099081528450828e-06, + "loss": 0.003, + "step": 28630 + }, + { + "epoch": 1.752861252218618, + "grad_norm": 0.07734280824661255, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0027, + "step": 28640 + }, + { + "epoch": 1.7534732847787502, + "grad_norm": 0.09990867972373962, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0025, + "step": 28650 + }, + { + "epoch": 1.7540853173388824, + "grad_norm": 0.087434321641922, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0026, + "step": 28660 + }, + { + "epoch": 1.7546973498990146, + "grad_norm": 0.14746612310409546, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.0024, + "step": 28670 + }, + { + "epoch": 1.7553093824591468, + "grad_norm": 0.10767928510904312, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0026, + "step": 28680 + }, + { + "epoch": 1.755921415019279, + "grad_norm": 0.1551862210035324, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0025, + "step": 28690 + }, + { + "epoch": 1.7565334475794112, + "grad_norm": 0.09829024225473404, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0039, + "step": 28700 + }, + { + "epoch": 1.7571454801395434, + "grad_norm": 0.10503874719142914, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0032, + "step": 28710 + }, + { + "epoch": 1.7577575126996756, + "grad_norm": 0.0730491355061531, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0031, + "step": 28720 + }, + { + "epoch": 1.7583695452598078, + "grad_norm": 0.10079263150691986, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0049, + "step": 28730 + }, + { + "epoch": 1.75898157781994, + "grad_norm": 0.09458324313163757, + "learning_rate": 2.083843803000755e-06, + "loss": 0.0032, + "step": 28740 + }, + { + "epoch": 1.7595936103800722, + "grad_norm": 0.10003770887851715, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0028, + "step": 28750 + }, + { + "epoch": 1.7602056429402044, + "grad_norm": 0.08591483533382416, + "learning_rate": 2.081209272216522e-06, + "loss": 0.004, + "step": 28760 + }, + { + "epoch": 1.7608176755003366, + "grad_norm": 0.06842748820781708, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0027, + "step": 28770 + }, + { + "epoch": 1.7614297080604688, + "grad_norm": 0.20110534131526947, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0032, + "step": 28780 + }, + { + "epoch": 1.762041740620601, + "grad_norm": 0.0891185775399208, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0032, + "step": 28790 + }, + { + "epoch": 1.7626537731807332, + "grad_norm": 0.0719524472951889, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0032, + "step": 28800 + }, + { + "epoch": 1.7632658057408654, + "grad_norm": 0.08921847492456436, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0021, + "step": 28810 + }, + { + "epoch": 1.7638778383009976, + "grad_norm": 0.15532712638378143, + "learning_rate": 2.073556929020357e-06, + "loss": 0.0036, + "step": 28820 + }, + { + "epoch": 1.7644898708611298, + "grad_norm": 0.09795820713043213, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0038, + "step": 28830 + }, + { + "epoch": 1.765101903421262, + "grad_norm": 0.16977304220199585, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0027, + "step": 28840 + }, + { + "epoch": 1.7657139359813943, + "grad_norm": 0.15005043148994446, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.0034, + "step": 28850 + }, + { + "epoch": 1.7663259685415265, + "grad_norm": 0.2764229476451874, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0036, + "step": 28860 + }, + { + "epoch": 1.7669380011016587, + "grad_norm": 0.10011457651853561, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0036, + "step": 28870 + }, + { + "epoch": 1.7675500336617906, + "grad_norm": 0.13169759511947632, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0042, + "step": 28880 + }, + { + "epoch": 1.7681620662219228, + "grad_norm": 0.09271719306707382, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0024, + "step": 28890 + }, + { + "epoch": 1.768774098782055, + "grad_norm": 0.08888175338506699, + "learning_rate": 2.063940494310565e-06, + "loss": 0.003, + "step": 28900 + }, + { + "epoch": 1.7693861313421873, + "grad_norm": 0.09285194426774979, + "learning_rate": 2.062785616754097e-06, + "loss": 0.0029, + "step": 28910 + }, + { + "epoch": 1.7699981639023195, + "grad_norm": 0.16032962501049042, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0036, + "step": 28920 + }, + { + "epoch": 1.7706101964624517, + "grad_norm": 0.1677922010421753, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0029, + "step": 28930 + }, + { + "epoch": 1.7712222290225839, + "grad_norm": 0.16370612382888794, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0032, + "step": 28940 + }, + { + "epoch": 1.771834261582716, + "grad_norm": 0.19864866137504578, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0035, + "step": 28950 + }, + { + "epoch": 1.7724462941428483, + "grad_norm": 0.06023133546113968, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0037, + "step": 28960 + }, + { + "epoch": 1.7730583267029805, + "grad_norm": 0.15308921039104462, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0037, + "step": 28970 + }, + { + "epoch": 1.7736703592631127, + "grad_norm": 0.061173055320978165, + "learning_rate": 2.054995221133923e-06, + "loss": 0.0036, + "step": 28980 + }, + { + "epoch": 1.7742823918232449, + "grad_norm": 0.11913572996854782, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0028, + "step": 28990 + }, + { + "epoch": 1.774894424383377, + "grad_norm": 0.09992241114377975, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0029, + "step": 29000 + }, + { + "epoch": 1.7755064569435093, + "grad_norm": 0.2562020719051361, + "learning_rate": 2.051813916260501e-06, + "loss": 0.0034, + "step": 29010 + }, + { + "epoch": 1.7761184895036415, + "grad_norm": 0.06800663471221924, + "learning_rate": 2.050774482045273e-06, + "loss": 0.0031, + "step": 29020 + }, + { + "epoch": 1.7767305220637737, + "grad_norm": 0.09397796541452408, + "learning_rate": 2.049745550244661e-06, + "loss": 0.0029, + "step": 29030 + }, + { + "epoch": 1.777342554623906, + "grad_norm": 0.4348801076412201, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0031, + "step": 29040 + }, + { + "epoch": 1.777954587184038, + "grad_norm": 0.11066912859678268, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0032, + "step": 29050 + }, + { + "epoch": 1.7785666197441703, + "grad_norm": 0.12962423264980316, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.0025, + "step": 29060 + }, + { + "epoch": 1.7791786523043025, + "grad_norm": 0.12331631779670715, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.003, + "step": 29070 + }, + { + "epoch": 1.7797906848644347, + "grad_norm": 0.14841991662979126, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.004, + "step": 29080 + }, + { + "epoch": 1.780402717424567, + "grad_norm": 0.060349978506565094, + "learning_rate": 2.043792576905478e-06, + "loss": 0.0021, + "step": 29090 + }, + { + "epoch": 1.781014749984699, + "grad_norm": 0.3353869616985321, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.004, + "step": 29100 + }, + { + "epoch": 1.7816267825448313, + "grad_norm": 0.1450352966785431, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0027, + "step": 29110 + }, + { + "epoch": 1.7822388151049635, + "grad_norm": 0.17684252560138702, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0032, + "step": 29120 + }, + { + "epoch": 1.7828508476650957, + "grad_norm": 0.153119757771492, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0031, + "step": 29130 + }, + { + "epoch": 1.783462880225228, + "grad_norm": 0.09753888100385666, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0038, + "step": 29140 + }, + { + "epoch": 1.7840749127853601, + "grad_norm": 0.1232074424624443, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0033, + "step": 29150 + }, + { + "epoch": 1.7846869453454923, + "grad_norm": 0.13487598299980164, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.004, + "step": 29160 + }, + { + "epoch": 1.7852989779056245, + "grad_norm": 0.1717495173215866, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.0032, + "step": 29170 + }, + { + "epoch": 1.7859110104657567, + "grad_norm": 0.21602065861225128, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0036, + "step": 29180 + }, + { + "epoch": 1.786523043025889, + "grad_norm": 0.18952055275440216, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0039, + "step": 29190 + }, + { + "epoch": 1.7871350755860211, + "grad_norm": 0.18922209739685059, + "learning_rate": 2.033861701352752e-06, + "loss": 0.0031, + "step": 29200 + }, + { + "epoch": 1.7877471081461533, + "grad_norm": 0.10511717200279236, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.0034, + "step": 29210 + }, + { + "epoch": 1.7883591407062855, + "grad_norm": 0.11389610171318054, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0049, + "step": 29220 + }, + { + "epoch": 1.7889711732664177, + "grad_norm": 0.08670853078365326, + "learning_rate": 2.031374190660474e-06, + "loss": 0.0021, + "step": 29230 + }, + { + "epoch": 1.78958320582655, + "grad_norm": 0.08372897654771805, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0063, + "step": 29240 + }, + { + "epoch": 1.7901952383866822, + "grad_norm": 0.08623497933149338, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.0022, + "step": 29250 + }, + { + "epoch": 1.7908072709468144, + "grad_norm": 0.2859722375869751, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0034, + "step": 29260 + }, + { + "epoch": 1.7914193035069466, + "grad_norm": 0.12175265699625015, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0033, + "step": 29270 + }, + { + "epoch": 1.7920313360670788, + "grad_norm": 0.07477760314941406, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0025, + "step": 29280 + }, + { + "epoch": 1.792643368627211, + "grad_norm": 0.16364359855651855, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.0025, + "step": 29290 + }, + { + "epoch": 1.7932554011873432, + "grad_norm": 0.1571386754512787, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0034, + "step": 29300 + }, + { + "epoch": 1.7938674337474754, + "grad_norm": 0.4267171323299408, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0036, + "step": 29310 + }, + { + "epoch": 1.7944794663076076, + "grad_norm": 0.1351214498281479, + "learning_rate": 2.02448011805316e-06, + "loss": 0.0035, + "step": 29320 + }, + { + "epoch": 1.7950914988677398, + "grad_norm": 0.39643657207489014, + "learning_rate": 2.023766764732934e-06, + "loss": 0.0036, + "step": 29330 + }, + { + "epoch": 1.795703531427872, + "grad_norm": 0.10161790996789932, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0032, + "step": 29340 + }, + { + "epoch": 1.7963155639880042, + "grad_norm": 0.1493646800518036, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0036, + "step": 29350 + }, + { + "epoch": 1.7969275965481364, + "grad_norm": 0.12911222875118256, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0029, + "step": 29360 + }, + { + "epoch": 1.7975396291082686, + "grad_norm": 0.11447032541036606, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0042, + "step": 29370 + }, + { + "epoch": 1.7981516616684008, + "grad_norm": 0.15647603571414948, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0034, + "step": 29380 + }, + { + "epoch": 1.798763694228533, + "grad_norm": 0.14736993610858917, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0028, + "step": 29390 + }, + { + "epoch": 1.7993757267886652, + "grad_norm": 0.10555008798837662, + "learning_rate": 2.01906829188547e-06, + "loss": 0.0031, + "step": 29400 + }, + { + "epoch": 1.7999877593487974, + "grad_norm": 0.08822382241487503, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0029, + "step": 29410 + }, + { + "epoch": 1.8005997919089296, + "grad_norm": 0.08774827420711517, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0029, + "step": 29420 + }, + { + "epoch": 1.8012118244690618, + "grad_norm": 0.12119588255882263, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0031, + "step": 29430 + }, + { + "epoch": 1.801823857029194, + "grad_norm": 0.14716175198554993, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.0037, + "step": 29440 + }, + { + "epoch": 1.8024358895893262, + "grad_norm": 0.09904798865318298, + "learning_rate": 2.016028412496094e-06, + "loss": 0.004, + "step": 29450 + }, + { + "epoch": 1.8030479221494584, + "grad_norm": 0.05114385858178139, + "learning_rate": 2.015452063399292e-06, + "loss": 0.003, + "step": 29460 + }, + { + "epoch": 1.8036599547095906, + "grad_norm": 0.12696151435375214, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0033, + "step": 29470 + }, + { + "epoch": 1.8042719872697228, + "grad_norm": 0.155229389667511, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0045, + "step": 29480 + }, + { + "epoch": 1.804884019829855, + "grad_norm": 0.22578737139701843, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0023, + "step": 29490 + }, + { + "epoch": 1.8054960523899872, + "grad_norm": 0.28504616022109985, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.0037, + "step": 29500 + }, + { + "epoch": 1.8061080849501194, + "grad_norm": 0.1696653664112091, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0028, + "step": 29510 + }, + { + "epoch": 1.8067201175102516, + "grad_norm": 0.13287198543548584, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0029, + "step": 29520 + }, + { + "epoch": 1.8073321500703838, + "grad_norm": 0.3142126202583313, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0027, + "step": 29530 + }, + { + "epoch": 1.807944182630516, + "grad_norm": 0.19873814284801483, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0047, + "step": 29540 + }, + { + "epoch": 1.8085562151906482, + "grad_norm": 0.19563670456409454, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0026, + "step": 29550 + }, + { + "epoch": 1.8091682477507804, + "grad_norm": 0.10068873316049576, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0026, + "step": 29560 + }, + { + "epoch": 1.8097802803109126, + "grad_norm": 0.32799556851387024, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0035, + "step": 29570 + }, + { + "epoch": 1.8103923128710449, + "grad_norm": 0.11536014825105667, + "learning_rate": 2.009358476954456e-06, + "loss": 0.0033, + "step": 29580 + }, + { + "epoch": 1.811004345431177, + "grad_norm": 0.2622664272785187, + "learning_rate": 2.008919246387043e-06, + "loss": 0.0036, + "step": 29590 + }, + { + "epoch": 1.8116163779913093, + "grad_norm": 0.14324435591697693, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0022, + "step": 29600 + }, + { + "epoch": 1.8122284105514415, + "grad_norm": 0.1239459365606308, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0025, + "step": 29610 + }, + { + "epoch": 1.8128404431115737, + "grad_norm": 0.17734837532043457, + "learning_rate": 2.007664865889131e-06, + "loss": 0.0038, + "step": 29620 + }, + { + "epoch": 1.8134524756717059, + "grad_norm": 0.09072575718164444, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.004, + "step": 29630 + }, + { + "epoch": 1.814064508231838, + "grad_norm": 0.09809702634811401, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0029, + "step": 29640 + }, + { + "epoch": 1.8146765407919703, + "grad_norm": 0.25450852513313293, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0034, + "step": 29650 + }, + { + "epoch": 1.8152885733521025, + "grad_norm": 0.06607849150896072, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0034, + "step": 29660 + }, + { + "epoch": 1.8159006059122347, + "grad_norm": 0.06259845197200775, + "learning_rate": 2.005785297772354e-06, + "loss": 0.0022, + "step": 29670 + }, + { + "epoch": 1.8165126384723669, + "grad_norm": 0.10518421977758408, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0033, + "step": 29680 + }, + { + "epoch": 1.817124671032499, + "grad_norm": 0.15561337769031525, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0031, + "step": 29690 + }, + { + "epoch": 1.8177367035926313, + "grad_norm": 0.08990275114774704, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.003, + "step": 29700 + }, + { + "epoch": 1.8183487361527635, + "grad_norm": 0.14854039251804352, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0034, + "step": 29710 + }, + { + "epoch": 1.8189607687128957, + "grad_norm": 0.08208440989255905, + "learning_rate": 2.004169609334462e-06, + "loss": 0.0027, + "step": 29720 + }, + { + "epoch": 1.8195728012730277, + "grad_norm": 0.18652454018592834, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0049, + "step": 29730 + }, + { + "epoch": 1.8201848338331599, + "grad_norm": 0.1906939297914505, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0035, + "step": 29740 + }, + { + "epoch": 1.820796866393292, + "grad_norm": 0.4511241614818573, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.004, + "step": 29750 + }, + { + "epoch": 1.8214088989534243, + "grad_norm": 0.12219764292240143, + "learning_rate": 2.003067084067522e-06, + "loss": 0.0027, + "step": 29760 + }, + { + "epoch": 1.8220209315135565, + "grad_norm": 0.17036347091197968, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.004, + "step": 29770 + }, + { + "epoch": 1.8226329640736887, + "grad_norm": 0.12429634481668472, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0038, + "step": 29780 + }, + { + "epoch": 1.823244996633821, + "grad_norm": 0.08393344283103943, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.003, + "step": 29790 + }, + { + "epoch": 1.823857029193953, + "grad_norm": 0.15894703567028046, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.0035, + "step": 29800 + }, + { + "epoch": 1.8244690617540853, + "grad_norm": 0.04971808195114136, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.0034, + "step": 29810 + }, + { + "epoch": 1.8250810943142175, + "grad_norm": 0.0734478309750557, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0025, + "step": 29820 + }, + { + "epoch": 1.8256931268743497, + "grad_norm": 0.10533800721168518, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0048, + "step": 29830 + }, + { + "epoch": 1.826305159434482, + "grad_norm": 0.1547422856092453, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0044, + "step": 29840 + }, + { + "epoch": 1.826917191994614, + "grad_norm": 0.09437263756990433, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0033, + "step": 29850 + }, + { + "epoch": 1.8275292245547463, + "grad_norm": 0.12579235434532166, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0025, + "step": 29860 + }, + { + "epoch": 1.8281412571148785, + "grad_norm": 0.06368619203567505, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0022, + "step": 29870 + }, + { + "epoch": 1.8287532896750107, + "grad_norm": 0.09379997849464417, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0035, + "step": 29880 + }, + { + "epoch": 1.829365322235143, + "grad_norm": 0.11959333717823029, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0023, + "step": 29890 + }, + { + "epoch": 1.8299773547952751, + "grad_norm": 0.14770719408988953, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.0043, + "step": 29900 + }, + { + "epoch": 1.8305893873554073, + "grad_norm": 0.10033386945724487, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0028, + "step": 29910 + }, + { + "epoch": 1.8312014199155395, + "grad_norm": 0.1918601095676422, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0028, + "step": 29920 + }, + { + "epoch": 1.8318134524756717, + "grad_norm": 0.11692646890878677, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0036, + "step": 29930 + }, + { + "epoch": 1.832425485035804, + "grad_norm": 0.07981011271476746, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.0028, + "step": 29940 + }, + { + "epoch": 1.8330375175959361, + "grad_norm": 0.08957688510417938, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0029, + "step": 29950 + }, + { + "epoch": 1.8336495501560683, + "grad_norm": 0.16067251563072205, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0033, + "step": 29960 + }, + { + "epoch": 1.8342615827162005, + "grad_norm": 0.08392494916915894, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0021, + "step": 29970 + }, + { + "epoch": 1.8348736152763327, + "grad_norm": 0.11575599759817123, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0027, + "step": 29980 + }, + { + "epoch": 1.835485647836465, + "grad_norm": 0.13176386058330536, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0029, + "step": 29990 + }, + { + "epoch": 1.8360976803965972, + "grad_norm": 0.11743218451738358, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.003, + "step": 30000 + }, + { + "epoch": 1.8360976803965972, + "step": 30000, + "total_flos": 1.87391671271424e+17, + "train_loss": 0.006151843793193499, + "train_runtime": 19781.9737, + "train_samples_per_second": 12.132, + "train_steps_per_second": 1.517 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.87391671271424e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/training_args.bin b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd9e28a44ae85140e2ef027a82e8be4c39167cc4 --- /dev/null +++ b/libero_on_top_extra_noops_pi0_VIS_PROJ_HEAD/libero_on_top_extra_noops_pi0_20260205-201028_lr2e-05_batchsize8/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5644791eb57bcb4c4808b4c2429b71e4c49eece4fc60f263f4553a3380f230bb +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-192603_lr2e-05_batchsize8/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-192603_lr2e-05_batchsize8/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..e3a2eaff653110c2ba0009cb56eb4b212cb1a946 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-192603_lr2e-05_batchsize8/norm_stats.json @@ -0,0 +1,414 @@ +{ + "state": { + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "q01": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "q99": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "min": [ + -6.664285659790039, + -24.918750762939453, + -26.890178680419922, + -4.980000019073486, + -6.679285526275635, + -8.928214073181152, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 36.96696472167969, + 45.19285583496094, + 22.706249237060547, + 3.8785715103149414, + 6.19178581237793, + 7.332857131958008, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "action": { + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "q01": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "q99": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "min": [ + -36.96696472167969, + -45.431251525878906, + -23.038393020629883, + -3.96321439743042, + -6.19178581237793, + -7.332857131958008, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 7.5508928298950195, + 25.856250762939453, + 27.827678680419922, + 5.0871429443359375, + 7.05214262008667, + 8.928214073181152, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8c20e0b14dc3e064d4acd0d654c71b594b6a812 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d37e544edbfe4fc241ba6612e2906a94f57350175d3b093cfd09068c0281c8c3 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f058fa351058107f3b78dcda87c525f59f69a969 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a69bf122b6b34be24a6449d84375dedb4629c65321194da0ecf5047bb30739 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..68345b3bc2f7cc7f19a15a2bd812aea6ee49db1d --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e894a8bcaee136ce7dd2834297b06b11da2e734fe1a381c19f5b68917dbc9da +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..0da6184f9732635317d9591566929a0f088174db --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa9b94474a8bec8a4ffe05d5b36c631e54295e9 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json @@ -0,0 +1,15434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.318233567020193, + "eval_steps": 500, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 2.688621997833252, + "learning_rate": 1.8e-07, + "loss": 0.1495, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.1722424030303955, + "learning_rate": 3.8e-07, + "loss": 0.1358, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 2.3095974922180176, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1268, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 2.131070852279663, + "learning_rate": 7.8e-07, + "loss": 0.1224, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 2.273555278778076, + "learning_rate": 9.800000000000001e-07, + "loss": 0.118, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 1.3571869134902954, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.111, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 1.6004165410995483, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 1.0413638353347778, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0657, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 1.1965473890304565, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0493, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 1.1422100067138672, + "learning_rate": 1.98e-06, + "loss": 0.0444, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 0.6911118626594543, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0457, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 0.6770259737968445, + "learning_rate": 2.38e-06, + "loss": 0.0257, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 0.4811704456806183, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0208, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 0.7260023951530457, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0203, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 0.4369716942310333, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0174, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 0.4100959300994873, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0133, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 1.0024627447128296, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0149, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.4598183035850525, + "learning_rate": 3.58e-06, + "loss": 0.0143, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 0.7042055130004883, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0143, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 0.7677909731864929, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0151, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 0.45090702176094055, + "learning_rate": 4.18e-06, + "loss": 0.0113, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 0.4400976598262787, + "learning_rate": 4.38e-06, + "loss": 0.0155, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 0.2424178272485733, + "learning_rate": 4.58e-06, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 0.4720967411994934, + "learning_rate": 4.78e-06, + "loss": 0.0166, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 0.41622042655944824, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0104, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 0.6915765404701233, + "learning_rate": 5.18e-06, + "loss": 0.0108, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.25931113958358765, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0104, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.42486071586608887, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.3798843324184418, + "learning_rate": 5.78e-06, + "loss": 0.0107, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.3281213343143463, + "learning_rate": 5.98e-06, + "loss": 0.0081, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 0.3394489884376526, + "learning_rate": 6.18e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 0.38298189640045166, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0098, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 0.3188078999519348, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.3152049779891968, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0087, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.34163472056388855, + "learning_rate": 6.98e-06, + "loss": 0.01, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 0.43860143423080444, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0065, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.2845093309879303, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0086, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 0.4009752869606018, + "learning_rate": 7.58e-06, + "loss": 0.0099, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.37756970524787903, + "learning_rate": 7.78e-06, + "loss": 0.0097, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.38135284185409546, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0076, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 0.3145769536495209, + "learning_rate": 8.18e-06, + "loss": 0.0106, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 0.32534345984458923, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0069, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.24024507403373718, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0089, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 0.32857799530029297, + "learning_rate": 8.78e-06, + "loss": 0.0105, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.28823110461235046, + "learning_rate": 8.98e-06, + "loss": 0.0101, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 0.32506972551345825, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0126, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 0.19875723123550415, + "learning_rate": 9.38e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.3245992958545685, + "learning_rate": 9.58e-06, + "loss": 0.0099, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.24933603405952454, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0117, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.3154098391532898, + "learning_rate": 9.980000000000001e-06, + "loss": 0.009, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.3685779273509979, + "learning_rate": 1.018e-05, + "loss": 0.0101, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 0.7251449823379517, + "learning_rate": 1.038e-05, + "loss": 0.0119, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 0.3183727264404297, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.009, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.3737810254096985, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0089, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.45293235778808594, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.011, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 0.3476772606372833, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.008, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.38373252749443054, + "learning_rate": 1.138e-05, + "loss": 0.0088, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 0.2530902624130249, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.008, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 0.19455896317958832, + "learning_rate": 1.178e-05, + "loss": 0.008, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.3315221071243286, + "learning_rate": 1.198e-05, + "loss": 0.0102, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.23430880904197693, + "learning_rate": 1.218e-05, + "loss": 0.007, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.4636307656764984, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0075, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.3785994052886963, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0109, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.2804955542087555, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0099, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.393702894449234, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0132, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.400641530752182, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0099, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 0.24428881704807281, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0076, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 0.4449252188205719, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0103, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.406582236289978, + "learning_rate": 1.378e-05, + "loss": 0.0098, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.36386170983314514, + "learning_rate": 1.398e-05, + "loss": 0.0088, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.38196030259132385, + "learning_rate": 1.418e-05, + "loss": 0.01, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.28740620613098145, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.3616485297679901, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0094, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.4004146158695221, + "learning_rate": 1.478e-05, + "loss": 0.009, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.4585514962673187, + "learning_rate": 1.498e-05, + "loss": 0.0092, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.20028235018253326, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0138, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 0.46603646874427795, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0139, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0116, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.22323082387447357, + "learning_rate": 1.578e-05, + "loss": 0.0097, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.26777058839797974, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0081, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.32380548119544983, + "learning_rate": 1.618e-05, + "loss": 0.0087, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.5248059630393982, + "learning_rate": 1.638e-05, + "loss": 0.0102, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.3495309054851532, + "learning_rate": 1.658e-05, + "loss": 0.0121, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.3551771342754364, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.5039486289024353, + "learning_rate": 1.698e-05, + "loss": 0.0094, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.3826751410961151, + "learning_rate": 1.718e-05, + "loss": 0.0107, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.46699973940849304, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0122, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3312668204307556, + "learning_rate": 1.758e-05, + "loss": 0.0087, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 0.28113219141960144, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.49752357602119446, + "learning_rate": 1.798e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.4177795350551605, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0096, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.34015583992004395, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.4612225890159607, + "learning_rate": 1.858e-05, + "loss": 0.0084, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.3813643753528595, + "learning_rate": 1.878e-05, + "loss": 0.012, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 0.27937838435173035, + "learning_rate": 1.898e-05, + "loss": 0.0104, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.4471273422241211, + "learning_rate": 1.918e-05, + "loss": 0.0125, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.4010440707206726, + "learning_rate": 1.938e-05, + "loss": 0.0106, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.41607654094696045, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0107, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 0.3589233458042145, + "learning_rate": 1.978e-05, + "loss": 0.0081, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.5726460814476013, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0111, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.36717164516448975, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0102, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.47284170985221863, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.01, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.5372244119644165, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.40928924083709717, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0088, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.4905182421207428, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0107, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.3709850609302521, + "learning_rate": 1.999981616897523e-05, + "loss": 0.01, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 0.6419615745544434, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0095, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.4986196458339691, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0127, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.5523516535758972, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0115, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0113, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 0.5146775245666504, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0101, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.2972394824028015, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0092, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.4030104875564575, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0097, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 0.4765481650829315, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0136, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.4051239788532257, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.3703782558441162, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5248176455497742, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0112, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.3100311756134033, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0083, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.45929211378097534, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0114, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 0.5695507526397705, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.5395359992980957, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0151, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.5106327533721924, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0124, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.3423260450363159, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0132, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.32126766443252563, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.011, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.5105165839195251, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0085, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 0.31927764415740967, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0088, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 0.4421865940093994, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0093, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.2930506765842438, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0091, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.2920694053173065, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0085, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.2661049962043762, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0081, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 0.3047257661819458, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0083, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.2774506211280823, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.2554785907268524, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0096, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.5792570114135742, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0108, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.3250623941421509, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0125, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 0.5885359048843384, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0117, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.37988749146461487, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.009, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.3751101493835449, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0099, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.31976667046546936, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0097, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 0.37007251381874084, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0079, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.4624205231666565, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0103, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 0.3769538700580597, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0094, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.25460657477378845, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0076, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.3976004719734192, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0109, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.2983521521091461, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.25581008195877075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0101, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.29260268807411194, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0102, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.3522181808948517, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0105, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.36269208788871765, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0103, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.40412119030952454, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0116, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.24089744687080383, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0119, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.4667617082595825, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0084, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.30139675736427307, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0101, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.38486286997795105, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0097, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.3526909649372101, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0071, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.3023934066295624, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0125, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.2796316146850586, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.25742489099502563, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0089, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.3626627027988434, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.01, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.3032572567462921, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0084, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.23514018952846527, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0086, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.3835832476615906, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0091, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.5170259475708008, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0146, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 0.8983817100524902, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0112, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.26260825991630554, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0086, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.481942743062973, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0126, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.311187207698822, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0064, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.3346790373325348, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.33836621046066284, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0085, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.3678463101387024, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0098, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.6136184334754944, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0154, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.39811593294143677, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0112, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6973778009414673, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0099, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.4773237109184265, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.3776084780693054, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.009, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 0.5061993598937988, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0097, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.41183987259864807, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.009, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 0.31513598561286926, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0112, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.4571514129638672, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0097, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.3183996379375458, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.2978666126728058, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0089, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.4791043698787689, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0087, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.5216032266616821, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0124, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.44693392515182495, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0092, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 0.41371819376945496, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.3593288064002991, + "learning_rate": 1.996106060741973e-05, + "loss": 0.014, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 0.4550306499004364, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0098, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.3510669469833374, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0066, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.2778814136981964, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.32210350036621094, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0067, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.42160800099372864, + "learning_rate": 1.995639934033493e-05, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.49051347374916077, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0102, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.3643694519996643, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.009, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.3717772960662842, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0076, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.32102280855178833, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0081, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.36725476384162903, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0102, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.39626258611679077, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0078, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.4183773696422577, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0105, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.3494930863380432, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0078, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.6155357956886292, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0119, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.34380587935447693, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0105, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.5476253032684326, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.01, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 0.37999996542930603, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0094, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 0.3124147057533264, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0125, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.4887244999408722, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.01, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5969874858856201, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0106, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 0.4295594096183777, + "learning_rate": 1.993971819309759e-05, + "loss": 0.007, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.3899303078651428, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0096, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.3912282884120941, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0075, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.5355616807937622, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.29141828417778015, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0129, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.24389855563640594, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.009, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.4070908725261688, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0085, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.26783379912376404, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0071, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.2644960880279541, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0089, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.35223162174224854, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0093, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.47337162494659424, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0095, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.25418519973754883, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0093, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 0.36384159326553345, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0082, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.30014440417289734, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0081, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.41121408343315125, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0081, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.5576186776161194, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.008, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.35785913467407227, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.3306240439414978, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0084, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 0.37215736508369446, + "learning_rate": 1.991774193879505e-05, + "loss": 0.012, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.5504099726676941, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0088, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.24932143092155457, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.007, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.5866615176200867, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.5174368619918823, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0121, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.2345893532037735, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0095, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.2683233916759491, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.2471713274717331, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.5090919733047485, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0108, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.2857886552810669, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0078, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.23729385435581207, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0096, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.30867621302604675, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0088, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.42522960901260376, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0103, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.37170591950416565, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0105, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.3672806918621063, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0121, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.4048611521720886, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.01, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.24768167734146118, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0125, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 0.5003495812416077, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0125, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.4303686022758484, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0084, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.3701602518558502, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0101, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.38272005319595337, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0075, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.2844183146953583, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0105, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.31114980578422546, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0095, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.3436568081378937, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.273001104593277, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0076, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.2653564512729645, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.3115384578704834, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0132, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.25932809710502625, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0083, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.28656521439552307, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0066, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.31808462738990784, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.0115, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.18877890706062317, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0092, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.3685394525527954, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0091, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.3878263533115387, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0082, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 0.284507691860199, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0085, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.3473755121231079, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0081, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.39935287833213806, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0081, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.34282153844833374, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0076, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.3581090271472931, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0087, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.37332627177238464, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0089, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 0.5224587321281433, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0089, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.42577075958251953, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0108, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.4602234959602356, + "learning_rate": 1.985504281027289e-05, + "loss": 0.014, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.4852961003780365, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0091, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.4437471628189087, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0112, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.37050408124923706, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0068, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.3345497250556946, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.36727628111839294, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0081, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 0.37056809663772583, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0152, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.5640603303909302, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0085, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 0.3653910160064697, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0078, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.2954258322715759, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0083, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6086210012435913, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0082, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 0.5260390043258667, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0105, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.3067379295825958, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0092, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.3480100929737091, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0088, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.26472753286361694, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0067, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.5254784226417542, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0146, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.35744136571884155, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0098, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.36186468601226807, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 0.35203835368156433, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0115, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.30590811371803284, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0108, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0082, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.2946765720844269, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0075, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.33707642555236816, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.007, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.2572688162326813, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0099, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.3901146352291107, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0185, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.4349755644798279, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0084, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.2383752018213272, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0092, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.46043846011161804, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.24630354344844818, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0062, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.5232640504837036, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 0.3850713074207306, + "learning_rate": 1.979809151602651e-05, + "loss": 0.014, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 0.44703760743141174, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0081, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.3762659728527069, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0099, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.4593638479709625, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0093, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.40554332733154297, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.33439910411834717, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0081, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.2623269855976105, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0062, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.22419600188732147, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0078, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.37183159589767456, + "learning_rate": 1.978133252131276e-05, + "loss": 0.01, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.27857136726379395, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.27683520317077637, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0069, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.45064759254455566, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0076, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.24215294420719147, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0071, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.5163891315460205, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0078, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.3922234773635864, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0077, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.19653558731079102, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0063, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.17621839046478271, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0084, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.6482162475585938, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.32759004831314087, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.33347561955451965, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0073, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.42883744835853577, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0084, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 0.3348788917064667, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0082, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.28349289298057556, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0102, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.2733197510242462, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0074, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.3263874351978302, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.01, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.295757532119751, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5598515868186951, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0093, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.425937294960022, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0083, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.2442379742860794, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0087, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.3378766179084778, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0163, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5137761831283569, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0099, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.3825916647911072, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0096, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.32084307074546814, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0066, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.3979593515396118, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0077, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.3103732764720917, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0067, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.5531997084617615, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0131, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5423216819763184, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0121, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.5038735270500183, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0087, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.44273868203163147, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.008, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.335232675075531, + "learning_rate": 1.971017390295979e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.4746256470680237, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.26807400584220886, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0075, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.35464033484458923, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0123, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.33803898096084595, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0094, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 0.20334473252296448, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.34386369585990906, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0081, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.38781842589378357, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0088, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.25994163751602173, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0079, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.3342406451702118, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0091, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.3120318353176117, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0079, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.3556351661682129, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0073, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.21421445906162262, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0095, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.39498451352119446, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0087, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0079, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.16734588146209717, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0072, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.3987548351287842, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.3929785490036011, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0096, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.2884303331375122, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0102, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.3338335454463959, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0092, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.47452738881111145, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0093, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.25584715604782104, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0068, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 0.3038389980792999, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0076, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.4123639464378357, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0101, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.38520675897598267, + "learning_rate": 1.964833301001045e-05, + "loss": 0.014, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.3355116844177246, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.3479195535182953, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0105, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.2700177729129791, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0076, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.963745667883003e-05, + "loss": 0.008, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 0.18578873574733734, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0071, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.26316413283348083, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0079, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.28762468695640564, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0115, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 0.3712877631187439, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.2862299382686615, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0072, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.2730867564678192, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0101, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.327648401260376, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.41153189539909363, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.32522135972976685, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0095, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.22764958441257477, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0085, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.3491888642311096, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.3123551607131958, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0103, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.1881783902645111, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0085, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.40902259945869446, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0089, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.382953941822052, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0088, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 0.23950865864753723, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0064, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.3419397175312042, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0118, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.42207059264183044, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0091, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.40754130482673645, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0087, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.2390766590833664, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0069, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.2974188029766083, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.0091, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.2993582785129547, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.42652204632759094, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.3138194680213928, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.009, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.38833311200141907, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0083, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.4015152156352997, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0081, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.42086881399154663, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.26732996106147766, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0071, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.5763937830924988, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0101, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.2955382764339447, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0075, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.4625638723373413, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0094, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.29631468653678894, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0096, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.46335819363594055, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0103, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.3183141350746155, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.008, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.26456212997436523, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0083, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 0.40924879908561707, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0097, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 0.3981763422489166, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0094, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.36437541246414185, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0064, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.2935962378978729, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0081, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.3478807210922241, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0079, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.3460087180137634, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0069, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.2706817090511322, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0088, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.2674945890903473, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0083, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.2268197238445282, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0072, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.3216208219528198, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0092, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.3226968050003052, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0101, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.2743329405784607, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0075, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.32573118805885315, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0094, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 0.53167325258255, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0099, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.3915646970272064, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0089, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.4526256322860718, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0101, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.323249489068985, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0094, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4046335816383362, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0088, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.34745559096336365, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0078, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.30308133363723755, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0071, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.37923407554626465, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0076, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 0.26785972714424133, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0093, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.2778306305408478, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0083, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.611038088798523, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0098, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.4114893078804016, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0111, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.2732110023498535, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0076, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.2964401841163635, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0095, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.40240928530693054, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0097, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.3901022672653198, + "learning_rate": 1.944152646499645e-05, + "loss": 0.008, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.38001132011413574, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0109, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.35937973856925964, + "learning_rate": 1.943474465322135e-05, + "loss": 0.007, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.2745327651500702, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0075, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.1598518043756485, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.007, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.401614785194397, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0115, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.4127846360206604, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0068, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.22147920727729797, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0061, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.28602245450019836, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0067, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.22147324681282043, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0076, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.2550548315048218, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0088, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.24113087356090546, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0076, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.3658410608768463, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0075, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.3856262266635895, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0112, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0075, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.3767516314983368, + "learning_rate": 1.938969919958475e-05, + "loss": 0.01, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.24380649626255035, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.009, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.30575039982795715, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0079, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.32913386821746826, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.009, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.29845312237739563, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0099, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.44377902150154114, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.34614384174346924, + "learning_rate": 1.936834723687526e-05, + "loss": 0.009, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.3316318690776825, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0096, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.4076138734817505, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 0.30320486426353455, + "learning_rate": 1.935753861926916e-05, + "loss": 0.015, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.32243025302886963, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.011, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.323745459318161, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0077, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5750753283500671, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0088, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.22709843516349792, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0101, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.3067542314529419, + "learning_rate": 1.933932815280178e-05, + "loss": 0.007, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.392337828874588, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0089, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.43343180418014526, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0073, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.4371345341205597, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0078, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.35214635729789734, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0077, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.3259161412715912, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.3849303722381592, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0066, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.3968902826309204, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.33016201853752136, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0095, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.3859156668186188, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.008, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.3020654618740082, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.007, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.44503262639045715, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0105, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.3908904194831848, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0073, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.39256253838539124, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0078, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.352611243724823, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0077, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.39203983545303345, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0081, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.23835115134716034, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0066, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.24996638298034668, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0098, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.29537609219551086, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.2898835837841034, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0077, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.4040369391441345, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.3501318395137787, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0094, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5462452173233032, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0097, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.4217568337917328, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0072, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.18295089900493622, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.3695569336414337, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.37818798422813416, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0089, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.29818472266197205, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0084, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.3328498303890228, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.01, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.340724378824234, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0075, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.2966301441192627, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0063, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.30677109956741333, + "learning_rate": 1.922098355206593e-05, + "loss": 0.008, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.2091839611530304, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0078, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.4229014217853546, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0115, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.40779992938041687, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0075, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.378817081451416, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.008, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.29796919226646423, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.2702767252922058, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0076, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.31349876523017883, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0085, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 0.30500444769859314, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0093, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.2860834002494812, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0061, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.26036593317985535, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0099, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.19049863517284393, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0075, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.3235284388065338, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.364092618227005, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.011, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.2409065216779709, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0092, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.36907926201820374, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.008, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.3230077922344208, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.191047802567482, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0063, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.3346494436264038, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0082, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.21352025866508484, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0075, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.5505086779594421, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.34264758229255676, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0083, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.20266413688659668, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0074, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.24938757717609406, + "learning_rate": 1.912718096497034e-05, + "loss": 0.007, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.4140026569366455, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0086, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.4424414038658142, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0104, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 0.5327904224395752, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 0.35958340764045715, + "learning_rate": 1.911035077753307e-05, + "loss": 0.01, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.2547682523727417, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0066, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.3701247274875641, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0115, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.34443217515945435, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0077, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.20353800058364868, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0061, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5660653114318848, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0091, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.26445311307907104, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0073, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.5561402440071106, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0071, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.3700469434261322, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.35783904790878296, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.008, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.3238641619682312, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0081, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.25247740745544434, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0099, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.435730904340744, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.008, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.37758126854896545, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0068, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.33323949575424194, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0094, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.4356318712234497, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 0.37893903255462646, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0058, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.4411139190196991, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0085, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0087, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4287096858024597, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0107, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.43085435032844543, + "learning_rate": 1.902392195640386e-05, + "loss": 0.009, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 0.2709400951862335, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0066, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.358126163482666, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0082, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.25320038199424744, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0077, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 0.31440937519073486, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0077, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.25246965885162354, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0079, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.28420332074165344, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0101, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.25251317024230957, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0075, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.19744229316711426, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0069, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4457854628562927, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0073, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.36817625164985657, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0096, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.3394709825515747, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0073, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.2909093201160431, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0065, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0072, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.25512900948524475, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0096, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0073, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.33459368348121643, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0096, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.21619321405887604, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0063, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.25518253445625305, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0067, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.2273867279291153, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.007, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.2864684462547302, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.3077942728996277, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0075, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 0.40526703000068665, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.41480064392089844, + "learning_rate": 1.891523933768891e-05, + "loss": 0.01, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.2750788629055023, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0064, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 0.29671600461006165, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0095, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 0.24160107970237732, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0069, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 0.2949109971523285, + "learning_rate": 1.889660337749874e-05, + "loss": 0.007, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.2847975492477417, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0059, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.30052465200424194, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0067, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.36128419637680054, + "learning_rate": 1.888252908366661e-05, + "loss": 0.014, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.36974236369132996, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0064, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.43730056285858154, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0084, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.3145422339439392, + "learning_rate": 1.88683715346172e-05, + "loss": 0.008, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.35473865270614624, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0091, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.2501350939273834, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.008, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.34808069467544556, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0099, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.45218509435653687, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0068, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.34530994296073914, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0098, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.38257333636283875, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0101, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.3040159344673157, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0079, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.3323517143726349, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0068, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.2639414370059967, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0078, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.3493870794773102, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.5838330984115601, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0091, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 0.428803026676178, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0087, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.3654572069644928, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0114, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.3295663297176361, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0075, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.3469060957431793, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0074, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.3366406261920929, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.32569241523742676, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0054, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3086700737476349, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0086, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.38562801480293274, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0092, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.3523421585559845, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0085, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.2278694063425064, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0063, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.32141822576522827, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0147, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.3375259041786194, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0077, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4483063220977783, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0062, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.3667140007019043, + "learning_rate": 1.874717450126662e-05, + "loss": 0.008, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.3419000506401062, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0079, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.36556369066238403, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0079, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.33135318756103516, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0064, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0091, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.34939518570899963, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0072, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.34424352645874023, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0077, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.3460613191127777, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.38822048902511597, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0066, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.35550639033317566, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0083, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0087, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.38202086091041565, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0081, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.25744789838790894, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0074, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.29700344800949097, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0082, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.305786669254303, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0076, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.3291271924972534, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.26111704111099243, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0074, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.348176509141922, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.27502793073654175, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0076, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.2831551432609558, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0092, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.39652079343795776, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0066, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.3885122239589691, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0087, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.37296077609062195, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.33606627583503723, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0086, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.3855937421321869, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0097, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.3322301506996155, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0076, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 0.33322253823280334, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.009, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.22358210384845734, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0088, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.5901851058006287, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0088, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.4703235328197479, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0084, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.20072896778583527, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0077, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.3537980616092682, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0098, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.3123277723789215, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0068, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.35979342460632324, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.38628828525543213, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0074, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.3498038053512573, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0074, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.20784054696559906, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0059, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.1811107099056244, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0085, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.43317103385925293, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0064, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.3815033435821533, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0064, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.35989734530448914, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.008, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.46118423342704773, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.012, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.25334376096725464, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0078, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.26764073967933655, + "learning_rate": 1.852547637090483e-05, + "loss": 0.01, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.2785920202732086, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0066, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.41587865352630615, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0061, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.42850133776664734, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.32369133830070496, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0091, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.2930110692977905, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0069, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.3199067711830139, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 0.4349478483200073, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0078, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 0.3054976165294647, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0061, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.2826739251613617, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0068, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.25106528401374817, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.007, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.25897887349128723, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0076, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.26398584246635437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.41751599311828613, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0083, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.17239610850811005, + "learning_rate": 1.844974808419918e-05, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3300461173057556, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0051, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.2645586133003235, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0068, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.24550332129001617, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0071, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.2889944911003113, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0091, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.476601779460907, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0066, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.35630306601524353, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0074, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.35651877522468567, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0084, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.3889803886413574, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0079, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4214278757572174, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.009, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.30540233850479126, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0083, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.3624532222747803, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0076, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.32963570952415466, + "learning_rate": 1.838347361898993e-05, + "loss": 0.01, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.3533381521701813, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0064, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.3011729419231415, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0065, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.4733760952949524, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0089, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.38553985953330994, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0059, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.2560643255710602, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0073, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.39531010389328003, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0106, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.2701983153820038, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0086, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.352717787027359, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0096, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.29157745838165283, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0073, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.4267994165420532, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0075, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.36308032274246216, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0075, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.33457428216934204, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0103, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.3717971444129944, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0069, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 0.21432936191558838, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0081, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.2878777086734772, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0057, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.4453850984573364, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0095, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.36917057633399963, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.3252313733100891, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0082, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.2529674470424652, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.2816419303417206, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0097, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.6464210152626038, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.33034399151802063, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0069, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.27335023880004883, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0078, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 0.3158395290374756, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.5128306746482849, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0087, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 0.24884961545467377, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0084, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.324278324842453, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0075, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6472476124763489, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0093, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.21269051730632782, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0066, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.29203882813453674, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0074, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.30436405539512634, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0087, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5066608190536499, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0081, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.32647472620010376, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0066, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.2804315388202667, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0066, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.24779941141605377, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0074, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.34001022577285767, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0101, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.2611280381679535, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0082, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.3129233717918396, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0079, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.2822776734828949, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0098, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.36969345808029175, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0064, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.33959338068962097, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0088, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.2628033459186554, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0062, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.38812723755836487, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0061, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.26403307914733887, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0055, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.28676870465278625, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0127, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.606293797492981, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.37321826815605164, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0063, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.368115097284317, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0091, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.3368416726589203, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0068, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.23466472327709198, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.006, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.3796599507331848, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0169, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.2202090471982956, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0099, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.5006175637245178, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0086, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.3673453629016876, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0083, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4379428029060364, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.006, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.43015891313552856, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0084, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.2806220054626465, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0061, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.23545289039611816, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0062, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 0.32115358114242554, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.3217777907848358, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.3224331736564636, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0072, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.31703537702560425, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0082, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 0.4175204932689667, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.008, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.22969186305999756, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0084, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.3421284258365631, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0077, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.32668444514274597, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0071, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.2729822099208832, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0068, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.33153197169303894, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0074, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.4678424000740051, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0076, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.23711496591567993, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0076, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.3230719566345215, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0084, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.32328692078590393, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0075, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.566879153251648, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0072, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.26277920603752136, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0062, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.339163601398468, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0082, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.23408609628677368, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0061, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.2942394018173218, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0065, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 0.3774799704551697, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0063, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.2847958207130432, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0072, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.2577030062675476, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0088, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.2883673906326294, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0075, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.3596307933330536, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0073, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.30285483598709106, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0076, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.2933914363384247, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0077, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.7666468024253845, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0102, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.31347739696502686, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0072, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.3435507118701935, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0081, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.3266170620918274, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0058, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.284027099609375, + "learning_rate": 1.784745142605655e-05, + "loss": 0.005, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.19972574710845947, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0072, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.2587524950504303, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0067, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.2922254204750061, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0064, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.17053507268428802, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0092, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.2850453555583954, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0073, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.2844892144203186, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0075, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.28969481587409973, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0079, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.4704195261001587, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0102, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.2652505338191986, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0077, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.2656702399253845, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0118, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.2282119244337082, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0086, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.30130353569984436, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0062, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.2295757234096527, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0066, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.25287938117980957, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0065, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.3274557292461395, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0076, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.34377023577690125, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0079, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.36259520053863525, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0055, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.24462608993053436, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0067, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.3615039587020874, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0088, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.40002626180648804, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0086, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.3362888991832733, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.33698126673698425, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0087, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.3287750482559204, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0068, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.23409898579120636, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0063, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.23275460302829742, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0066, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.35324692726135254, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0068, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.2781875729560852, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0066, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.3083304166793823, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0069, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.22543831169605255, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0066, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.22566530108451843, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0066, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.3640650808811188, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.35346123576164246, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0069, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0076, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.30895760655403137, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0074, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.30667638778686523, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0082, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.3134152889251709, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0086, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.21407048404216766, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.3456077575683594, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.4259016513824463, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.009, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.38690924644470215, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0094, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.31742537021636963, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0065, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.3568819463253021, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0077, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.3771888315677643, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.25528469681739807, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0067, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.36028411984443665, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0064, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.41987329721450806, + "learning_rate": 1.754802282200567e-05, + "loss": 0.007, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.18902993202209473, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0064, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.1859915405511856, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0086, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.1778331696987152, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0052, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4222147464752197, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.007, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.26806506514549255, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0074, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.34431734681129456, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0056, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.41732800006866455, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0079, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.3027847409248352, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0054, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.47592151165008545, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0066, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9539707899093628, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0095, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.4084669351577759, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0082, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.3052361309528351, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0072, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.23123528063297272, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.009, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.20356184244155884, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0073, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.048543930053711, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.3017459213733673, + "learning_rate": 1.74400239259128e-05, + "loss": 0.007, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.3679676353931427, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0085, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.20339734852313995, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0087, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.3523346781730652, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0076, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.4162348210811615, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0063, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.3293565511703491, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0067, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.24455691874027252, + "learning_rate": 1.739902378104222e-05, + "loss": 0.007, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 0.17645037174224854, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0051, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.2554231286048889, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0076, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.20006878674030304, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.27911216020584106, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.5701723694801331, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0081, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.222118079662323, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0072, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.2762138843536377, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0049, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 1.4110082387924194, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0114, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.31313180923461914, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0078, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.20941513776779175, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0079, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.3963930308818817, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0053, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.2066672146320343, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.3919369876384735, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0082, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.2544628083705902, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.0054, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.31123557686805725, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0078, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.24768301844596863, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0051, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.26674744486808777, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0052, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.27382466197013855, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.23384103178977966, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.0059, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.3531075417995453, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0068, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.34425088763237, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0066, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.2716144323348999, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.35163211822509766, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0071, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.23585639894008636, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0072, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.28066661953926086, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0068, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.3146689832210541, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0071, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.37553170323371887, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.008, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.18403242528438568, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0068, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.3904851973056793, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0072, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.4481397867202759, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0074, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.31124234199523926, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0074, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.3815377354621887, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0084, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.2909438908100128, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0074, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.3408021330833435, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.23902025818824768, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0076, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.2194853127002716, + "learning_rate": 1.714740708672306e-05, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 0.4337097108364105, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0092, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.4132380783557892, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0078, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.3434816598892212, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0076, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.25129666924476624, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0058, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.45458248257637024, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0064, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.5350340008735657, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.009, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 0.28008121252059937, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.33276447653770447, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0064, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37103456258773804, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0078, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 0.4689319133758545, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0073, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.3622629642486572, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.006, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0073, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.19226481020450592, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0059, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.27806007862091064, + "learning_rate": 1.704700993266678e-05, + "loss": 0.007, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.25948378443717957, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0076, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.5857216715812683, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.30467140674591064, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0073, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.2067701816558838, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0068, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 0.5653601288795471, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0087, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.3107249140739441, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0065, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4027363061904907, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0098, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.2757766544818878, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0091, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.30397671461105347, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0061, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.28112074732780457, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0063, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.14751966297626495, + "learning_rate": 1.696714953556411e-05, + "loss": 0.008, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.2988373935222626, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0055, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.2706286311149597, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0066, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.3612031042575836, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.25386789441108704, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0065, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.3170768916606903, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0056, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.4776926338672638, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0059, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.34828829765319824, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0088, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.20440815389156342, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.2943046987056732, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0068, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.16982606053352356, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0073, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.5607914924621582, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0085, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.35823172330856323, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0064, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.23943926393985748, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0068, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.24083787202835083, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0056, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.37987980246543884, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0062, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.35953620076179504, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0069, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.22255095839500427, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0071, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.4121200442314148, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0098, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.2377164363861084, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0076, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.2298472374677658, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0064, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.40824711322784424, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0066, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.33295100927352905, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.007, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.3978032171726227, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0077, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.27672451734542847, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.006, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.2591206729412079, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0089, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.1749347746372223, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0051, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.18699893355369568, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0056, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.240631103515625, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0089, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.3650512993335724, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0075, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.3503545820713043, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0067, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.3086877167224884, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.41695648431777954, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0064, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.33144691586494446, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0067, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.2679164409637451, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0072, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.22681233286857605, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0071, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.36362454295158386, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0067, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.20192845165729523, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0067, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.3895004093647003, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0055, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.22510671615600586, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0069, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.19641445577144623, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0101, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.2914806008338928, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0076, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.3187137544155121, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0059, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.3116552233695984, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0095, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.2597426772117615, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0058, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.21480600535869598, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.23912057280540466, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.006, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.317941278219223, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0064, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.58933025598526, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0095, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.21906700730323792, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0105, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.23899045586585999, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0059, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.2969389259815216, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0124, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.3514954447746277, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0066, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.18145518004894257, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0077, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.3087640404701233, + "learning_rate": 1.656303606359183e-05, + "loss": 0.006, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.3532063364982605, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0055, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.34000685811042786, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.24904295802116394, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0073, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.008, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.20241902768611908, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.009, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.3215351700782776, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0075, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0081, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.48170387744903564, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0071, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.3369109630584717, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0066, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.34541958570480347, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0058, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.2493886947631836, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0058, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.22845667600631714, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.2695702016353607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0055, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 0.28211796283721924, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0052, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.1901162564754486, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.2701025605201721, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0061, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.36527693271636963, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0072, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.3061700463294983, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0067, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.5612105131149292, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0087, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.23399880528450012, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0072, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.314933180809021, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0078, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.35548436641693115, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0094, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.37685567140579224, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0084, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.3190719783306122, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0065, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.26337119936943054, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0063, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.3518264889717102, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0072, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.3185817003250122, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.2995646893978119, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0064, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.3110463619232178, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0063, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.24277286231517792, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0064, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.17603862285614014, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0061, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.28089356422424316, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0076, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.2855492830276489, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0047, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.3247278928756714, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0058, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.18349547684192657, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0061, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.30654969811439514, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.007, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.2674420177936554, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0067, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.38177546858787537, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0091, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.33796218037605286, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0068, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.3754856586456299, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0063, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.21820858120918274, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.007, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.36184942722320557, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0061, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.32240399718284607, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0063, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 0.24755406379699707, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0059, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.397858589887619, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0064, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.389072448015213, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0063, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.3368140757083893, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0071, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0062, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.24265453219413757, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0076, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.19892603158950806, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0064, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.1852462887763977, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0051, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.1886446475982666, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0075, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.25982722640037537, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0068, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.3376137614250183, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0058, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.33173730969429016, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0064, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.3177517354488373, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0072, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.3385971784591675, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0066, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0073, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.2335229516029358, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0056, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.24502214789390564, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0054, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.2009458988904953, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.3341793715953827, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0082, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.3872147798538208, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0063, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.29940876364707947, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0073, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.4895729720592499, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0086, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.4485950469970703, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0053, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.22961653769016266, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0077, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.24187293648719788, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.005, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.3535212278366089, + "learning_rate": 1.601916647245149e-05, + "loss": 0.007, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.26539868116378784, + "learning_rate": 1.601107070706339e-05, + "loss": 0.008, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.43096065521240234, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0076, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.16919535398483276, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0058, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.2383720725774765, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0064, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.36103156208992004, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0067, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.2657287120819092, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0072, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.21437199413776398, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0065, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.34000417590141296, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0046, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.4855337142944336, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0068, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.3178497850894928, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0064, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.3171309530735016, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0067, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.3364340662956238, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0067, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2272711992263794, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0069, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.29505178332328796, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0078, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.3755042552947998, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0081, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.2983969449996948, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0085, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.3112468421459198, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0072, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.1950412392616272, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0061, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.2153436243534088, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.25062650442123413, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0079, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.1407836377620697, + "learning_rate": 1.584793312377278e-05, + "loss": 0.005, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.17276513576507568, + "learning_rate": 1.583971586792325e-05, + "loss": 0.006, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.47983887791633606, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0076, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.28724750876426697, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0076, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.3224884569644928, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0079, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.37969788908958435, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0063, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.48106926679611206, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0071, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.3555319905281067, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0075, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.19486083090305328, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.006, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.42018064856529236, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0074, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.3075830936431885, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0071, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.20921990275382996, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0063, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.20436584949493408, + "learning_rate": 1.574895332125391e-05, + "loss": 0.006, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0071, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.22980183362960815, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0078, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.24825431406497955, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.22042447328567505, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0071, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.249199777841568, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0076, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.32628607749938965, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0057, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.35151633620262146, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0059, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.29098865389823914, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0064, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.24006013572216034, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0058, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.2797141671180725, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0073, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.2963006794452667, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.19539053738117218, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0053, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.2686854898929596, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0051, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.35952430963516235, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0071, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.21042552590370178, + "learning_rate": 1.562410199183484e-05, + "loss": 0.005, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.27942436933517456, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0068, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.17137926816940308, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0063, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.20331411063671112, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0047, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.15683002769947052, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0052, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.14726290106773376, + "learning_rate": 1.558221191857467e-05, + "loss": 0.006, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.2940376400947571, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0068, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.4059796929359436, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0067, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.2587816119194031, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0086, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.3462979793548584, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0078, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.5607128739356995, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0079, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.24189788103103638, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0052, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 0.23362945020198822, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0073, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.22395116090774536, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0059, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.3514958322048187, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0064, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.25395795702934265, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0081, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.2948741018772125, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.22298739850521088, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.46948447823524475, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0097, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.2992243468761444, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0083, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.18001538515090942, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0055, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.23337051272392273, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.2863878905773163, + "learning_rate": 1.543878746906905e-05, + "loss": 0.006, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.23027309775352478, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0072, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.21359150111675262, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0064, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.3878735601902008, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0069, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.29146283864974976, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.007, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.21782676875591278, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0051, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.45582008361816406, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0063, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.4554077982902527, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0067, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.2254059612751007, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0064, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.13952374458312988, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0061, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.23241721093654633, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0072, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.3424162268638611, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0058, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.21074503660202026, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0057, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.33662086725234985, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0056, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.24403709173202515, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0073, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.27195101976394653, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0058, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.34224429726600647, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0072, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.29089581966400146, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0053, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3397226333618164, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0066, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.30517837405204773, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0092, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.3485032021999359, + "learning_rate": 1.52681291800283e-05, + "loss": 0.007, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.31346458196640015, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0045, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.1864607185125351, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.006, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.20976679027080536, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.22616958618164062, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0059, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.14772117137908936, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.33677151799201965, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0059, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.32354292273521423, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0061, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.21409569680690765, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0064, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.4659721851348877, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 0.32267874479293823, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.5019848942756653, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0061, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.32694318890571594, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0076, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.3013843297958374, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.1973707377910614, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0059, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.3365449607372284, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0059, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.3398677110671997, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.007, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.27888917922973633, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0062, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.2814931273460388, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0069, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.3317541182041168, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.006, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.21940776705741882, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0052, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.239700049161911, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0059, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.19117280840873718, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0071, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.21827168762683868, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0056, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.25645333528518677, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0085, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.30847233533859253, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0055, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.3127819895744324, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0058, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.30181658267974854, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0075, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.34778207540512085, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0077, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.18988046050071716, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0048, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.3479195833206177, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0045, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.24158424139022827, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0051, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.14698052406311035, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0053, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.4441753625869751, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0065, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.28078633546829224, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.29406028985977173, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0048, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3856968581676483, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0067, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.36528849601745605, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0062, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.34250667691230774, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0053, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.2862832844257355, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0055, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.3683549761772156, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0091, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.26892581582069397, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.2220073938369751, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0052, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.18825116753578186, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0065, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.28731998801231384, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0069, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.26817163825035095, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0058, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.44162800908088684, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0065, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 0.2990165948867798, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0074, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.20428279042243958, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0053, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.2918189465999603, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0056, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.30408942699432373, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0063, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.2593521177768707, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0061, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.34048640727996826, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0054, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.2438877820968628, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0059, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.5205245018005371, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0065, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.3658570349216461, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0061, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.23279106616973877, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0039, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.2704083323478699, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0054, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.1849551945924759, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0061, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.21807430684566498, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0059, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.47879981994628906, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0061, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.24125567078590393, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0056, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.25820469856262207, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0053, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.30664944648742676, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0075, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.3646678030490875, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0057, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.2534210979938507, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0045, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.2125798910856247, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 0.4387839734554291, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0072, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.337387353181839, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.01, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.23150259256362915, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.3243090808391571, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0076, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.26716119050979614, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.006, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.15551891922950745, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0061, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.1841796338558197, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0058, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 0.3119230270385742, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.2633327841758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0059, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.24567869305610657, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0055, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.3697315454483032, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0061, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.1941021829843521, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0052, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.2610131502151489, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.007, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.24856074154376984, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0062, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.27259066700935364, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0052, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.20962993800640106, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0055, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.4015270471572876, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.22935271263122559, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0063, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.29984018206596375, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0059, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.35775551199913025, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0079, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.15501125156879425, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0054, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.3543296158313751, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.1982075721025467, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.2616399824619293, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0062, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.2612541615962982, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0064, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3081730008125305, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0055, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.24024926126003265, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.20793405175209045, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.21445533633232117, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0058, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.24078251421451569, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0059, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.36214157938957214, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0061, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.2583295702934265, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0054, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.2641732394695282, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0069, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.2179708331823349, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0049, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.27418699860572815, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0049, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.3894921839237213, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0076, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.3912152945995331, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0063, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.16886518895626068, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0059, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.2731325626373291, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.3299262225627899, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.007, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.2671407163143158, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0058, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.2701479196548462, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0059, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.3803080916404724, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.2621704041957855, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0061, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.27780428528785706, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0065, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.3326016962528229, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.3632255792617798, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0069, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.24395202100276947, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0065, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.3215671181678772, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0066, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0065, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.31547197699546814, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0043, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.1893424689769745, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0059, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.27042335271835327, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0059, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.22597061097621918, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0063, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.1742873191833496, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0062, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.16797663271427155, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0048, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.42558521032333374, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0075, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.37216684222221375, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0061, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.19943472743034363, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.2211161106824875, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0075, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.2680184245109558, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0052, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.2402123361825943, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0051, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.1881084442138672, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0066, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.26134756207466125, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0063, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.3185539245605469, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0062, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.3118845820426941, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.22595946490764618, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.007, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.2627023458480835, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0067, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.2984865605831146, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0051, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0057, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.3078263998031616, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0074, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.17885653674602509, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0057, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.37737196683883667, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0058, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.21651378273963928, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0053, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.1974128633737564, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0059, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.17184904217720032, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.3074864447116852, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0059, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.28784239292144775, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0061, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.3435216546058655, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0065, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.38048845529556274, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0057, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.1875533014535904, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0052, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.48555630445480347, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0063, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 0.25066429376602173, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0055, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.2763892412185669, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0059, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.21217335760593414, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0092, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.23555652797222137, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.14828811585903168, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 0.27303484082221985, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0047, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.14681454002857208, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0067, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.43693456053733826, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.2940906286239624, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0059, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.20382657647132874, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0074, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0069, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.31879740953445435, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0062, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4898712933063507, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0051, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.17142456769943237, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0061, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.14010348916053772, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0045, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.26882827281951904, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0056, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.2636195421218872, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0048, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.24932081997394562, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0045, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.3367895185947418, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0049, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.15173649787902832, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0053, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.34083831310272217, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0072, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3327343165874481, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0048, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.36545902490615845, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0076, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.22761192917823792, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0067, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.19272181391716003, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0072, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.2881070375442505, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.006, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.32841676473617554, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0063, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.19850151240825653, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0052, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.31401291489601135, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.4023345112800598, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0058, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.25802844762802124, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0051, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.19678954780101776, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0053, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.4545653164386749, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0073, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.36174362897872925, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0068, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.31692951917648315, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0063, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.3470834195613861, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.29541268944740295, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0062, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.26377183198928833, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.006, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.2019137591123581, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0058, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.45156505703926086, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.007, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.15810425579547882, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.006, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.20093902945518494, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.28989917039871216, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0062, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.39454182982444763, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0063, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.25967612862586975, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0069, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.2058791220188141, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.26367849111557007, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0074, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.2432256042957306, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0054, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.19844679534435272, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.16757237911224365, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0052, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.2988821566104889, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0047, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.2231496274471283, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0048, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.265029639005661, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0048, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.41179928183555603, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0049, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.33498677611351013, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0052, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.2323407232761383, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0048, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.27306419610977173, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0061, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.2791977822780609, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0088, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.453421026468277, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.2572932839393616, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0056, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.19572272896766663, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0051, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.2831172049045563, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0057, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.21267575025558472, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0059, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.3220005929470062, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0057, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.2515857517719269, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0063, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.18344618380069733, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0052, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.34515154361724854, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0052, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.16711464524269104, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0054, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.3027217984199524, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.006, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.31168296933174133, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.007, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5778804421424866, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0056, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.2591782212257385, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.2449295073747635, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0046, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 0.19733767211437225, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0054, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.14837461709976196, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0053, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.3784295916557312, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0054, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.2400134950876236, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.17671307921409607, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0051, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.2664073705673218, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.006, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.25426605343818665, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0062, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.26733267307281494, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0049, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.46151378750801086, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.006, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.17070212960243225, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0062, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.42009514570236206, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0052, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.20439159870147705, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.25189417600631714, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0066, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.21402288973331451, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0072, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.294109046459198, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0061, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.29355865716934204, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0061, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.2937833368778229, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0061, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.1926010102033615, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0056, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.21794214844703674, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0065, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.23409108817577362, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0067, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.4696379005908966, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0062, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.28415724635124207, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0061, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.22433705627918243, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0064, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3090682923793793, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0056, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.23742817342281342, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0057, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.2670089900493622, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0052, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.2810697555541992, + "learning_rate": 1.299277443549658e-05, + "loss": 0.007, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.44233059883117676, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0069, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.14227768778800964, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0064, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.298776239156723, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0072, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.2882034480571747, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0064, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.23135380446910858, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0064, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.2870500981807709, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.005, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0064, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.2949783504009247, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0081, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.2215491235256195, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.26351356506347656, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0082, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.1909482628107071, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0052, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.13428187370300293, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0068, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.2125115543603897, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0048, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.27032148838043213, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0056, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.20981402695178986, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0069, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.24961373209953308, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0073, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.13643066585063934, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0054, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.25289252400398254, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.4061530828475952, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.006, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.29924723505973816, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0055, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.37029367685317993, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0053, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.37273409962654114, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0066, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.18242980539798737, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0054, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.18563945591449738, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.32972440123558044, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0045, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 0.3327874541282654, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0065, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.2077408730983734, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.1813255399465561, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0055, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.17811767756938934, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.20526157319545746, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0043, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.112189382314682, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0055, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.29082757234573364, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0099, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.23212411999702454, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0067, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.17449915409088135, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.3327349126338959, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0047, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.2709571123123169, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0056, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.19788618385791779, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0063, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.22075456380844116, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.2943982779979706, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0057, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.1718410849571228, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.3546068072319031, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.18132814764976501, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0047, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.17795684933662415, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0048, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.22964486479759216, + "learning_rate": 1.257232766480803e-05, + "loss": 0.005, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.3259448707103729, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0072, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.18410101532936096, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.28669047355651855, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0056, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.25986725091934204, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0055, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.1731722205877304, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0053, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.17501944303512573, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.005, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.2749968469142914, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0046, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.26125603914260864, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0055, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.22476239502429962, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0103, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.26169249415397644, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0067, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.19236186146736145, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0048, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.26535508036613464, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0055, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.2534106373786926, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.29464206099510193, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0076, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.3711875081062317, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0059, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0055, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.27274343371391296, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.006, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.15951389074325562, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0069, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.33735600113868713, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0064, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.19443227350711823, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0051, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.1960541307926178, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0049, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21133695542812347, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0066, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.22702853381633759, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.006, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.22489185631275177, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0061, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.33164891600608826, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0067, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.22196516394615173, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0055, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.19532594084739685, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0048, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.41902172565460205, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0064, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0052, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.2507944703102112, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0051, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.30817684531211853, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0052, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.27485454082489014, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.006, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.14287802577018738, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 0.14513961970806122, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.3345814645290375, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0051, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.2974685728549957, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0049, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.3455393612384796, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0062, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.16792115569114685, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.3038713335990906, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.005, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.2928559184074402, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.2317439168691635, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0039, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.3498123586177826, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0067, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.2850436866283417, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0045, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.18316122889518738, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0089, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.34362390637397766, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0066, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.13047993183135986, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0057, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.3403606116771698, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0055, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.27717292308807373, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0043, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.27412480115890503, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.1914675235748291, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0075, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.3778243958950043, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0084, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.20566068589687347, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.007, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.1868937760591507, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0051, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.24719548225402832, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.005, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0053, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4353996217250824, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.005, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.31571000814437866, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.005, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.14182177186012268, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0048, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.3461489975452423, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0062, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.17980965971946716, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0043, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.28671878576278687, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0048, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.18663623929023743, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0072, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.25223061442375183, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0063, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.20179906487464905, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.37325599789619446, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0079, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.18855971097946167, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.2992260754108429, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0051, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.18020357191562653, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0046, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.2106374204158783, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3749687373638153, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.1616801619529724, + "learning_rate": 1.188676298665799e-05, + "loss": 0.007, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.20882001519203186, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0143, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.16600479185581207, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0052, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.406480073928833, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0051, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.27349016070365906, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0056, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.2340608835220337, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0044, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.3165459632873535, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0042, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.19552721083164215, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0047, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.21882636845111847, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0061, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.23699741065502167, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.283207505941391, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0053, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.2782933712005615, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0062, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.3389151096343994, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0074, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.25642505288124084, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.19476772844791412, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0067, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.1992277055978775, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0057, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.21006375551223755, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0058, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.18808932602405548, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0073, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.258075475692749, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0052, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.29291409254074097, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.19002115726470947, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0041, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.4246057868003845, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 0.16166792809963226, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.005, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.35779255628585815, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.20405125617980957, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0082, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.23229332268238068, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0095, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.21156901121139526, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0074, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.22334401309490204, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0051, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.18344342708587646, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0048, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.22982414066791534, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0056, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.24991759657859802, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0046, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.27965986728668213, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.309841126203537, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0054, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.20964398980140686, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0044, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.45226722955703735, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0057, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.17177052795886993, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0064, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.8886939287185669, + "learning_rate": 1.153689339251154e-05, + "loss": 0.008, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.14726528525352478, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0066, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.32135209441185, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0064, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.22926779091358185, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.21345189213752747, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0047, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.31324461102485657, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0072, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.2185574620962143, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0047, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.36229151487350464, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0042, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.3479749262332916, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0053, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.23806153237819672, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.30633601546287537, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0079, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.2326052039861679, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0063, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 0.1756114363670349, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0064, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.18622055649757385, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0045, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.3261238932609558, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.16155003011226654, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0057, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.22661013901233673, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0046, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.24310468137264252, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0044, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.16182619333267212, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0056, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.1656215786933899, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0039, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.2945510447025299, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0049, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.24436083436012268, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.34221476316452026, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0069, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.26235878467559814, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0055, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.23333275318145752, + "learning_rate": 1.130316049722011e-05, + "loss": 0.005, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.23382601141929626, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0057, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 0.1693800389766693, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0058, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.3740929067134857, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.005, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.26146796345710754, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0038, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.13361674547195435, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0053, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8631370663642883, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0085, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.2952764630317688, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0054, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.23047442734241486, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0054, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0059, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.3246142864227295, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0066, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.31531205773353577, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0045, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4806351959705353, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0089, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.15645328164100647, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0051, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.29767802357673645, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0044, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.23338516056537628, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0055, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.20454354584217072, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0049, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.2087928056716919, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.004, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.18911990523338318, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0058, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.16931432485580444, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.3027138411998749, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.22635169327259064, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.26646292209625244, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0047, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.20067426562309265, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0054, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.22507227957248688, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0076, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.18533077836036682, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.005, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.1757635474205017, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0077, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.2326493263244629, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.006, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.2661048471927643, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0048, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.3285987079143524, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0047, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.3764145076274872, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.005, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.19637148082256317, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0048, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 0.16601431369781494, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.005, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.12405529618263245, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0036, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.21413138508796692, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.3323937952518463, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0057, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.20915299654006958, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0054, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.28372666239738464, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0048, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.32995301485061646, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0051, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.2148507684469223, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0061, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.22549118101596832, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.005, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.19749189913272858, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0049, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.250184565782547, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0065, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.23174546658992767, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.2707926034927368, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0049, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.175989031791687, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0058, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.2267833948135376, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0044, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.3495822846889496, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0048, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.2051204890012741, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0063, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.22149987518787384, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0058, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.21434035897254944, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0046, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.2996143400669098, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0065, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.22886960208415985, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0053, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.3317148685455322, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.005, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.45717868208885193, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0062, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.1223258301615715, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0051, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.2037084549665451, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0046, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.3772616982460022, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0045, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.30312252044677734, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0069, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.14988413453102112, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0047, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.3409348130226135, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.2308650016784668, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0049, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.15572187304496765, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0051, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.1962181180715561, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0049, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.337464302778244, + "learning_rate": 1.067930046280971e-05, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.17047251760959625, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0045, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.3098141849040985, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0043, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.17919068038463593, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0052, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.3461310863494873, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.006, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.37006744742393494, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0066, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.19726566970348358, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.005, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.1319705843925476, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0049, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.2131422460079193, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0055, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.1435563862323761, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0067, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.24024318158626556, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0055, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.1511068344116211, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0052, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.16795606911182404, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0047, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.1475641280412674, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0046, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.21277494728565216, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0048, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.2511015832424164, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.24675171077251434, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0059, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.2560728192329407, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0055, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.30879196524620056, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.005, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.1838868409395218, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0052, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.1673516035079956, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.20293423533439636, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0047, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.25513023138046265, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0052, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.26149800419807434, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.27551159262657166, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0041, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.2508440911769867, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0043, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.2889135181903839, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.1755184680223465, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0051, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.2095116674900055, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.33451047539711, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0079, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.44589516520500183, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0064, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.24158142507076263, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.15632936358451843, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.006, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.10808487981557846, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0065, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.1782998889684677, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0046, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.16395118832588196, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.004, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 0.30205732583999634, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0058, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.1561775654554367, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.004, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.1649634838104248, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0062, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.15428072214126587, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0043, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.11285894364118576, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0067, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.3470291793346405, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0056, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.16610246896743774, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0051, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.29931193590164185, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0051, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.15366005897521973, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.005, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.2352767139673233, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0057, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.19226962327957153, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0042, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.1903623789548874, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0044, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.4167932868003845, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0071, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.2913760840892792, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0046, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.2632276713848114, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0063, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.21258050203323364, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0043, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.19750680029392242, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0032, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.2896588444709778, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0045, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3017624020576477, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0074, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.18355949223041534, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0051, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.16483789682388306, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0056, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.2190672904253006, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.28435200452804565, + "learning_rate": 1.011517750003287e-05, + "loss": 0.005, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.2564929723739624, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0049, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.2592712342739105, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0048, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.18716935813426971, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0047, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.18236829340457916, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0049, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.27956655621528625, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0056, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.13664546608924866, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0048, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.21617569029331207, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0052, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.2196502536535263, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0054, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.20864732563495636, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0041, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.38381293416023254, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.005, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.1605401486158371, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0045, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.2079813927412033, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0051, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.2110205590724945, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0054, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.2421400547027588, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0048, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.41358140110969543, + "learning_rate": 9.969762660447491e-06, + "loss": 0.006, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.23386628925800323, + "learning_rate": 9.960077585586335e-06, + "loss": 0.005, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.20425592362880707, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0059, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.21164651215076447, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.1642364114522934, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0034, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.18716906011104584, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.15626995265483856, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0044, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.18394386768341064, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0044, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.3590037524700165, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0073, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.2103291153907776, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0051, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.19865299761295319, + "learning_rate": 9.87296819358355e-06, + "loss": 0.006, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.2052467316389084, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0065, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.31245940923690796, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0049, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.2959006726741791, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0042, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.33695659041404724, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0071, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.20898328721523285, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0062, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.3500119149684906, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0049, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.3926694095134735, + "learning_rate": 9.805290087509098e-06, + "loss": 0.007, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.24234539270401, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.1705496460199356, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0056, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.2907398045063019, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0048, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.2366454005241394, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0047, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.25498414039611816, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0046, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.163838192820549, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0048, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.1613040417432785, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0048, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.3639470338821411, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0042, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.22151169180870056, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0043, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.13474372029304504, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0051, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.2601003050804138, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0038, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.20202822983264923, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.18514803051948547, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0061, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.16678287088871002, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0038, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.17608965933322906, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0041, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.26356828212738037, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0059, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.297612726688385, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0047, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.16363881528377533, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.12642459571361542, + "learning_rate": 9.621949874438232e-06, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.3339644968509674, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0052, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.20784282684326172, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0046, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.28467273712158203, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0047, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.3124372661113739, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0051, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.3490087389945984, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.15114343166351318, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0051, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.41157594323158264, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0058, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.40405890345573425, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0045, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.1149911880493164, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0087, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.18746539950370789, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0058, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.1327875554561615, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0049, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.1530160903930664, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0038, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.2663615047931671, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0049, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.3390499949455261, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0046, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.2461002618074417, + "learning_rate": 9.477616135359713e-06, + "loss": 0.006, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.2141093611717224, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0049, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.20443470776081085, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0052, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.14927290380001068, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0039, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.3012462854385376, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0047, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.33484792709350586, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0045, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.19986321032047272, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0041, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.21612870693206787, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.19541047513484955, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0044, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.24203962087631226, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0049, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.1470087766647339, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0049, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.2336059808731079, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0048, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.32893121242523193, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0044, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.32034680247306824, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0055, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.27538758516311646, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0049, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.18869644403457642, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0065, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.2719379961490631, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0047, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.2850756347179413, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0043, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.19997543096542358, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0068, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.19222821295261383, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.16414248943328857, + "learning_rate": 9.285803018919292e-06, + "loss": 0.004, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.23754803836345673, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0039, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.2682085335254669, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0048, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.18268488347530365, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0046, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.14906349778175354, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0034, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.19079554080963135, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0041, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.09538780897855759, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.19193744659423828, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.1366361379623413, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.29436588287353516, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0052, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.24179348349571228, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0047, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.236627459526062, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0061, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.1719210296869278, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0054, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.2724406123161316, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.09852395206689835, + "learning_rate": 9.152007262148612e-06, + "loss": 0.004, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.23493632674217224, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.20697079598903656, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0047, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.16597376763820648, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0048, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.23542962968349457, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0046, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.18859006464481354, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0054, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.16773538291454315, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0044, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.2122378647327423, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0042, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.18205690383911133, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0046, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.1791398823261261, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0043, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.4446735680103302, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0052, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.32150915265083313, + "learning_rate": 9.047178679583151e-06, + "loss": 0.005, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.15855731070041656, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0045, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.19377414882183075, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0057, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.25969046354293823, + "learning_rate": 9.018636566864313e-06, + "loss": 0.006, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.2349981814622879, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.1853523701429367, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0051, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.22417226433753967, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.1969340741634369, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0058, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.18523764610290527, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.28188323974609375, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0052, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.18134717643260956, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0048, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.15660132467746735, + "learning_rate": 8.942627394858978e-06, + "loss": 0.004, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.3179869055747986, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0044, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.14007267355918884, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.31531354784965515, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0062, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.1867508888244629, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0054, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4172282814979553, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0056, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.21233956515789032, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0054, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.13055016100406647, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0048, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.24662990868091583, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0054, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.1877284198999405, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0045, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.20158089697360992, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0052, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.23169469833374023, + "learning_rate": 8.83836825410936e-06, + "loss": 0.0048, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.27991265058517456, + "learning_rate": 8.828905148874785e-06, + "loss": 0.008, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.3321090638637543, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.14790703356266022, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0033, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.1504756361246109, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0052, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.2211659848690033, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0038, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.1777208149433136, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0041, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.2586643397808075, + "learning_rate": 8.772180411864604e-06, + "loss": 0.006, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.2705499529838562, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.16527540981769562, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0037, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.24313445389270782, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0057, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.16705767810344696, + "learning_rate": 8.734416061983528e-06, + "loss": 0.004, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.20638783276081085, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0052, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.26159438490867615, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0039, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.30387070775032043, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0038, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.24292278289794922, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0042, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.3707493543624878, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0056, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.41142478585243225, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.22052627801895142, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0047, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.14626234769821167, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0047, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.25504666566848755, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0046, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.2020457535982132, + "learning_rate": 8.640192851412488e-06, + "loss": 0.006, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.2440478354692459, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0047, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.12040785700082779, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0044, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.25539812445640564, + "learning_rate": 8.611979388060327e-06, + "loss": 0.006, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.20701228082180023, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0041, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.24188214540481567, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0063, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.24987974762916565, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0063, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.20973123610019684, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0049, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.19898714125156403, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0061, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21703247725963593, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0056, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.18688541650772095, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0054, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.30194586515426636, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0049, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.17975366115570068, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0046, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.25966599583625793, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.1702205240726471, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.18940114974975586, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0052, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.18239127099514008, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0047, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.14571616053581238, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0046, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.17203395068645477, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0038, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.249881312251091, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0056, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.296194463968277, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0044, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.21376049518585205, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0052, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.2952374815940857, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.20862646400928497, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0051, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.17828255891799927, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0053, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.20771050453186035, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0038, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3046565651893616, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.12605167925357819, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0046, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.13702887296676636, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0038, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.11569058150053024, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0042, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.27488255500793457, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0054, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.30820342898368835, + "learning_rate": 8.349909816537207e-06, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.3108576536178589, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0056, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.16087505221366882, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0044, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.27139320969581604, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0055, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.17057007551193237, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0036, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.13946233689785004, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0057, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.2342602014541626, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0038, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.17249339818954468, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.2641673684120178, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0044, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.18304336071014404, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0041, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.25955966114997864, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0045, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.2159314751625061, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0038, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.254371702671051, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0043, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.10616741329431534, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0036, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.38598379492759705, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0065, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.3797863721847534, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0048, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.2059139758348465, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0062, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.19991335272789001, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0043, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.17376656830310822, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0047, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.17102457582950592, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0056, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.501983642578125, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0065, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.40338510274887085, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.10511627048254013, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0052, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.2610682249069214, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 0.09666074812412262, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0058, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.19014683365821838, + "learning_rate": 8.117972135268806e-06, + "loss": 0.005, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.2999255657196045, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.20351538062095642, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0049, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.1562410295009613, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0034, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.14160799980163574, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0035, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.10796743631362915, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0056, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.28861188888549805, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.3835368752479553, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.21850043535232544, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0038, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.2950346767902374, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0068, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.13051068782806396, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0041, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.11036359518766403, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0074, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.35306516289711, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0087, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.29782727360725403, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0045, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.20690713822841644, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0042, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.16064110398292542, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0038, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.2477649450302124, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0042, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.25939393043518066, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0045, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3345301151275635, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0045, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.19570066034793854, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0052, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.09655601531267166, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.13345655798912048, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0031, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.3130756616592407, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0072, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.16259168088436127, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0036, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.2581227123737335, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.36706119775772095, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.1705426573753357, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0069, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.4281153380870819, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0057, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.25743696093559265, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0036, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.17692404985427856, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.17617255449295044, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0043, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.193951815366745, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0042, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.2187023162841797, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0047, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.21488729119300842, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0039, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.13388743996620178, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0043, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.26977118849754333, + "learning_rate": 7.796848308199681e-06, + "loss": 0.004, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.40695786476135254, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0049, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.29070621728897095, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0056, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.2745647728443146, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0056, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.20881050825119019, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0057, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.17475518584251404, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0041, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.2414310723543167, + "learning_rate": 7.742248115573104e-06, + "loss": 0.004, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.20051640272140503, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.18383435904979706, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.16546988487243652, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0041, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.17165544629096985, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.25065234303474426, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0048, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.19762223958969116, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0038, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.23894545435905457, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.2860289216041565, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0053, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.3699626624584198, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0061, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.2370971292257309, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0043, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.19790691137313843, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0042, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.14648208022117615, + "learning_rate": 7.633462930388875e-06, + "loss": 0.005, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.105158232152462, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0032, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.24994254112243652, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0042, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.30648791790008545, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0058, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.16284243762493134, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0047, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.14919471740722656, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0045, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.14879491925239563, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0047, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.11741457879543304, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0041, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.09406878799200058, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0029, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.20860706269741058, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.24234607815742493, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0047, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.27025938034057617, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0042, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.15129081904888153, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0046, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.11173490434885025, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0035, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.2204807698726654, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0036, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.20111115276813507, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0087, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.213748961687088, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0045, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.21150177717208862, + "learning_rate": 7.480328799175369e-06, + "loss": 0.004, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.2450210005044937, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.16161729395389557, + "learning_rate": 7.4623904967312e-06, + "loss": 0.004, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.15077564120292664, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3078431487083435, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0051, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.15213221311569214, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.12404917925596237, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0042, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.18779516220092773, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0041, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.4039568603038788, + "learning_rate": 7.408675563767873e-06, + "loss": 0.005, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.2045651078224182, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0057, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.3885338306427002, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0049, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.253049373626709, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0059, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.250356525182724, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.3269367814064026, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0112, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.15401138365268707, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0052, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.1631775051355362, + "learning_rate": 7.346200065486093e-06, + "loss": 0.004, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.17112085223197937, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0038, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.24018551409244537, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0056, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.17964349687099457, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0057, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.1747465431690216, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0053, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.21299205720424652, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0038, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.13219258189201355, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0057, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 1.0558332204818726, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0066, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2154799997806549, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0041, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.13665339350700378, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.2101723700761795, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0039, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.13208501040935516, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0054, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.09342823177576065, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0032, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.22464905679225922, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0055, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.17030438780784607, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0042, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.17673689126968384, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0055, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.24041922390460968, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0048, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.14808662235736847, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.2489791214466095, + "learning_rate": 7.186522173441719e-06, + "loss": 0.004, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.19468742609024048, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0042, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.15028323233127594, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0061, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.13852037489414215, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.1401798278093338, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0063, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.1831122189760208, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0034, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.2867920994758606, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.13363438844680786, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0038, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.20085696876049042, + "learning_rate": 7.116016051769541e-06, + "loss": 0.004, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.1598372906446457, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0042, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.09672598540782928, + "learning_rate": 7.098434895408162e-06, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.18206225335597992, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0048, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.1818019449710846, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0038, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.21658800542354584, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.08513368666172028, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.10634194314479828, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0044, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.12106078863143921, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0037, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.11508465558290482, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0036, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.20805053412914276, + "learning_rate": 7.028294242074066e-06, + "loss": 0.004, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.23920200765132904, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0045, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.1300375908613205, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0045, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.23444809019565582, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0036, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.2636217772960663, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0044, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.31166398525238037, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.005, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.32881107926368713, + "learning_rate": 6.975884226362e-06, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.41748252511024475, + "learning_rate": 6.967165692827958e-06, + "loss": 0.006, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.1588834673166275, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0039, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.23697984218597412, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0039, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.19356773793697357, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0061, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.16373206675052643, + "learning_rate": 6.932338988482141e-06, + "loss": 0.004, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.1331700086593628, + "learning_rate": 6.923644220932124e-06, + "loss": 0.004, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4039696753025055, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0057, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.30325421690940857, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0065, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.21767468750476837, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0038, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.17474445700645447, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0056, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.17118008434772491, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.44261473417282104, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0063, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.18502798676490784, + "learning_rate": 6.862915366041247e-06, + "loss": 0.004, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.19384194910526276, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0036, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.1448352187871933, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0044, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3728172779083252, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0038, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.31421783566474915, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0043, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.28181371092796326, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0045, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.2249889373779297, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0041, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.26402008533477783, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0043, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.22621415555477142, + "learning_rate": 6.793802468038111e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.2681289315223694, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0045, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.17681041359901428, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0037, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.16526542603969574, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0032, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.30313149094581604, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0046, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.17628541588783264, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0065, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.1840096414089203, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.146232470870018, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0035, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.4804438352584839, + "learning_rate": 6.725005485342219e-06, + "loss": 0.005, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.2245558500289917, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0039, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.21845588088035583, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.1743943691253662, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0037, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.16978098452091217, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.27158796787261963, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0043, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.13516400754451752, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0048, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.1645064353942871, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0038, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.07616083323955536, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0046, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.13306911289691925, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.19445037841796875, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0044, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.18423207104206085, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0049, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.19280213117599487, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0043, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.25472623109817505, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0033, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.16799427568912506, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0031, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.2097395807504654, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.31450021266937256, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.16530238091945648, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0034, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.2506805956363678, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0038, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.1876160055398941, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0035, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.23704354465007782, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.13814999163150787, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0042, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.1164403185248375, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0042, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.23078426718711853, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0038, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.21749110519886017, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0046, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.24972137808799744, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.2491082102060318, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0043, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.14915086328983307, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0048, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.2794116735458374, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.13765662908554077, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0047, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.14874878525733948, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0042, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.1800280064344406, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0057, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.17518648505210876, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0049, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0045, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.3590790033340454, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0039, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.14534324407577515, + "learning_rate": 6.427861749601945e-06, + "loss": 0.004, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.1662825047969818, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.27466440200805664, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0045, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.1323469579219818, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0047, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.12367355078458786, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0077, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.18238325417041779, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0058, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.2733745574951172, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.3367181420326233, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0039, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.20671530067920685, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.23353071510791779, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0033, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.21081902086734772, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0031, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.3426077365875244, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0049, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.3905622959136963, + "learning_rate": 6.327475567095824e-06, + "loss": 0.004, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.1888400912284851, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0041, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.23982487618923187, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0041, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.2061331421136856, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0046, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.17000116407871246, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0033, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.15905790030956268, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0049, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.16794419288635254, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0052, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.3003343641757965, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0061, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.1429288536310196, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0042, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.18542084097862244, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0047, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.2692892253398895, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0035, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.23286236822605133, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0037, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.0963423103094101, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0041, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.1425798237323761, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0043, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.0960182398557663, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0046, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.2674477994441986, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 0.16276703774929047, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0041, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.24255621433258057, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.003, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.20395220816135406, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0054, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.12099681794643402, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0082, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.14017170667648315, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0042, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.28132137656211853, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0043, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.27220970392227173, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0039, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.23647353053092957, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0058, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.20623824000358582, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.12366114556789398, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0037, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.23330192267894745, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0056, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.19991633296012878, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0031, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.1496160626411438, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0058, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.13247868418693542, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0037, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.19072194397449493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0057, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.10773085057735443, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0042, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.14058449864387512, + "learning_rate": 6.063685039328116e-06, + "loss": 0.005, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.10825464874505997, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0042, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.18059906363487244, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.1713389754295349, + "learning_rate": 6.039253929027638e-06, + "loss": 0.005, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.23789434134960175, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0047, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.17626744508743286, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0041, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.2091904729604721, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0044, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.17293672263622284, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0043, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.13156521320343018, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0039, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.19591976702213287, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0043, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.16212835907936096, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0039, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.10661022365093231, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0037, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.16630858182907104, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.11001022905111313, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0037, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.1888381838798523, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0044, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.19239328801631927, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0044, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.16555139422416687, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0032, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.19748231768608093, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0043, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.1546473354101181, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0049, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.30511707067489624, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0037, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.1722872257232666, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0048, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.1784086525440216, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.15101182460784912, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0042, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.1252688318490982, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0041, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.15101821720600128, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0043, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.21302345395088196, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0035, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.1591431051492691, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0033, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.16010484099388123, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0049, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.19287234544754028, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0037, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.1804349720478058, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0036, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.14769446849822998, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0044, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.15914054214954376, + "learning_rate": 5.813791207086085e-06, + "loss": 0.004, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.19632315635681152, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0034, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3017818331718445, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0046, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.2728461027145386, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0044, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.18619874119758606, + "learning_rate": 5.781966956563247e-06, + "loss": 0.004, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.1235085129737854, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0037, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.15798084437847137, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0035, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.15713484585285187, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0036, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.15594886243343353, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0038, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.1558992713689804, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.20599815249443054, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0054, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.2785670757293701, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0042, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.22550497949123383, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.15210074186325073, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.18905121088027954, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.1337066888809204, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.23699362576007843, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.2480958253145218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0037, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.09328999370336533, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.3416430950164795, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0048, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.13258710503578186, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0032, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.18493984639644623, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0037, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.10433483123779297, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0045, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.18333138525485992, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0038, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.25164106488227844, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0058, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.17989882826805115, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0041, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.1597793847322464, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.1543695032596588, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0036, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.2985675036907196, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0043, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.1357773244380951, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0036, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.23978300392627716, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.005, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.12806151807308197, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0035, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.2222731113433838, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0039, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.16744646430015564, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0035, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.2162114977836609, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0048, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.14857177436351776, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0036, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.21318115293979645, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0032, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.257682204246521, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0036, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.254349946975708, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0042, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.148925319314003, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0029, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.1902056336402893, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0031, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.17580094933509827, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0026, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.18856695294380188, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0045, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.17185454070568085, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0039, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.1997966468334198, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0043, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.14173944294452667, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0033, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.20653635263442993, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0039, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.19571708142757416, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0026, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.0877508670091629, + "learning_rate": 5.438496901657042e-06, + "loss": 0.005, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.17305001616477966, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0038, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.16555450856685638, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0035, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.15395715832710266, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2430422455072403, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0032, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.2465265393257141, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0034, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.08382703363895416, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0038, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3427184224128723, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0042, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.13029031455516815, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.11826448887586594, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0035, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.1612391620874405, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0039, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21143540740013123, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0057, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.22977286577224731, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.005, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.11853202432394028, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0058, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.24277184903621674, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0038, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.2625603675842285, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0048, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.1333419382572174, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0033, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.09627685695886612, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.416618674993515, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0038, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.18699553608894348, + "learning_rate": 5.294041118587667e-06, + "loss": 0.004, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.1827329397201538, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0039, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.19719162583351135, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0034, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.09895205497741699, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0042, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.11187861114740372, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0036, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.154103085398674, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.11124159395694733, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.27686378359794617, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0041, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.12900429964065552, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.26441213488578796, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0032, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.2187345325946808, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.004, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.08503159135580063, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0034, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.12869144976139069, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0035, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.13212713599205017, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0027, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.23211228847503662, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0032, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.2017366737127304, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.21221789717674255, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0039, + "step": 22000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3741821653522842e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..19d22af7b7d6155175015b5c3c5b452030d153ea --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-22000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf8e16588ffacf58cd09ed0241d355125d76c992d11c15a4bc8ee94db38dc3b +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a0935b1e5a6557a7aa8f3da736e7cd7335fc8dc4 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58cade9cc73f58851b67b2dd5bff0393b3f17db9ceb7021ef9e069fc58281f16 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72f489ab693371707524c153c1d652d06cba9c40 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70d1599e9a5532ee72df09ee3c2bc6ecede0e5cee41590ae9d9c244a7a800ce4 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7df53a4f3de7b0e2e03954c17ab5a1152145e161 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:595e0178886c6822751d405dba427fd221e3a760e32ce73bdaadc2ec38b33b9c +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..0da6184f9732635317d9591566929a0f088174db --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ac6af9f7fa2495621e368a658ec831ec8194c8f --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json @@ -0,0 +1,16834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4380729822038467, + "eval_steps": 500, + "global_step": 24000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 2.688621997833252, + "learning_rate": 1.8e-07, + "loss": 0.1495, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.1722424030303955, + "learning_rate": 3.8e-07, + "loss": 0.1358, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 2.3095974922180176, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1268, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 2.131070852279663, + "learning_rate": 7.8e-07, + "loss": 0.1224, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 2.273555278778076, + "learning_rate": 9.800000000000001e-07, + "loss": 0.118, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 1.3571869134902954, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.111, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 1.6004165410995483, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 1.0413638353347778, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0657, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 1.1965473890304565, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0493, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 1.1422100067138672, + "learning_rate": 1.98e-06, + "loss": 0.0444, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 0.6911118626594543, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0457, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 0.6770259737968445, + "learning_rate": 2.38e-06, + "loss": 0.0257, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 0.4811704456806183, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0208, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 0.7260023951530457, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0203, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 0.4369716942310333, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0174, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 0.4100959300994873, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0133, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 1.0024627447128296, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0149, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.4598183035850525, + "learning_rate": 3.58e-06, + "loss": 0.0143, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 0.7042055130004883, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0143, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 0.7677909731864929, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0151, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 0.45090702176094055, + "learning_rate": 4.18e-06, + "loss": 0.0113, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 0.4400976598262787, + "learning_rate": 4.38e-06, + "loss": 0.0155, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 0.2424178272485733, + "learning_rate": 4.58e-06, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 0.4720967411994934, + "learning_rate": 4.78e-06, + "loss": 0.0166, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 0.41622042655944824, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0104, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 0.6915765404701233, + "learning_rate": 5.18e-06, + "loss": 0.0108, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.25931113958358765, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0104, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.42486071586608887, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.3798843324184418, + "learning_rate": 5.78e-06, + "loss": 0.0107, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.3281213343143463, + "learning_rate": 5.98e-06, + "loss": 0.0081, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 0.3394489884376526, + "learning_rate": 6.18e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 0.38298189640045166, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0098, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 0.3188078999519348, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.3152049779891968, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0087, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.34163472056388855, + "learning_rate": 6.98e-06, + "loss": 0.01, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 0.43860143423080444, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0065, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.2845093309879303, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0086, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 0.4009752869606018, + "learning_rate": 7.58e-06, + "loss": 0.0099, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.37756970524787903, + "learning_rate": 7.78e-06, + "loss": 0.0097, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.38135284185409546, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0076, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 0.3145769536495209, + "learning_rate": 8.18e-06, + "loss": 0.0106, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 0.32534345984458923, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0069, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.24024507403373718, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0089, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 0.32857799530029297, + "learning_rate": 8.78e-06, + "loss": 0.0105, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.28823110461235046, + "learning_rate": 8.98e-06, + "loss": 0.0101, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 0.32506972551345825, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0126, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 0.19875723123550415, + "learning_rate": 9.38e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.3245992958545685, + "learning_rate": 9.58e-06, + "loss": 0.0099, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.24933603405952454, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0117, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.3154098391532898, + "learning_rate": 9.980000000000001e-06, + "loss": 0.009, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.3685779273509979, + "learning_rate": 1.018e-05, + "loss": 0.0101, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 0.7251449823379517, + "learning_rate": 1.038e-05, + "loss": 0.0119, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 0.3183727264404297, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.009, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.3737810254096985, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0089, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.45293235778808594, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.011, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 0.3476772606372833, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.008, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.38373252749443054, + "learning_rate": 1.138e-05, + "loss": 0.0088, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 0.2530902624130249, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.008, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 0.19455896317958832, + "learning_rate": 1.178e-05, + "loss": 0.008, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.3315221071243286, + "learning_rate": 1.198e-05, + "loss": 0.0102, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.23430880904197693, + "learning_rate": 1.218e-05, + "loss": 0.007, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.4636307656764984, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0075, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.3785994052886963, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0109, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.2804955542087555, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0099, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.393702894449234, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0132, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.400641530752182, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0099, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 0.24428881704807281, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0076, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 0.4449252188205719, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0103, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.406582236289978, + "learning_rate": 1.378e-05, + "loss": 0.0098, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.36386170983314514, + "learning_rate": 1.398e-05, + "loss": 0.0088, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.38196030259132385, + "learning_rate": 1.418e-05, + "loss": 0.01, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.28740620613098145, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.3616485297679901, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0094, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.4004146158695221, + "learning_rate": 1.478e-05, + "loss": 0.009, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.4585514962673187, + "learning_rate": 1.498e-05, + "loss": 0.0092, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.20028235018253326, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0138, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 0.46603646874427795, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0139, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0116, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.22323082387447357, + "learning_rate": 1.578e-05, + "loss": 0.0097, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.26777058839797974, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0081, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.32380548119544983, + "learning_rate": 1.618e-05, + "loss": 0.0087, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.5248059630393982, + "learning_rate": 1.638e-05, + "loss": 0.0102, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.3495309054851532, + "learning_rate": 1.658e-05, + "loss": 0.0121, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.3551771342754364, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.5039486289024353, + "learning_rate": 1.698e-05, + "loss": 0.0094, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.3826751410961151, + "learning_rate": 1.718e-05, + "loss": 0.0107, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.46699973940849304, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0122, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3312668204307556, + "learning_rate": 1.758e-05, + "loss": 0.0087, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 0.28113219141960144, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.49752357602119446, + "learning_rate": 1.798e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.4177795350551605, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0096, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.34015583992004395, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.4612225890159607, + "learning_rate": 1.858e-05, + "loss": 0.0084, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.3813643753528595, + "learning_rate": 1.878e-05, + "loss": 0.012, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 0.27937838435173035, + "learning_rate": 1.898e-05, + "loss": 0.0104, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.4471273422241211, + "learning_rate": 1.918e-05, + "loss": 0.0125, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.4010440707206726, + "learning_rate": 1.938e-05, + "loss": 0.0106, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.41607654094696045, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0107, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 0.3589233458042145, + "learning_rate": 1.978e-05, + "loss": 0.0081, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.5726460814476013, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0111, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.36717164516448975, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0102, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.47284170985221863, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.01, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.5372244119644165, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.40928924083709717, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0088, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.4905182421207428, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0107, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.3709850609302521, + "learning_rate": 1.999981616897523e-05, + "loss": 0.01, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 0.6419615745544434, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0095, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.4986196458339691, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0127, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.5523516535758972, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0115, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0113, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 0.5146775245666504, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0101, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.2972394824028015, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0092, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.4030104875564575, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0097, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 0.4765481650829315, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0136, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.4051239788532257, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.3703782558441162, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5248176455497742, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0112, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.3100311756134033, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0083, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.45929211378097534, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0114, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 0.5695507526397705, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.5395359992980957, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0151, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.5106327533721924, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0124, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.3423260450363159, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0132, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.32126766443252563, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.011, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.5105165839195251, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0085, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 0.31927764415740967, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0088, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 0.4421865940093994, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0093, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.2930506765842438, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0091, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.2920694053173065, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0085, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.2661049962043762, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0081, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 0.3047257661819458, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0083, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.2774506211280823, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.2554785907268524, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0096, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.5792570114135742, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0108, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.3250623941421509, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0125, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 0.5885359048843384, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0117, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.37988749146461487, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.009, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.3751101493835449, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0099, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.31976667046546936, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0097, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 0.37007251381874084, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0079, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.4624205231666565, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0103, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 0.3769538700580597, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0094, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.25460657477378845, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0076, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.3976004719734192, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0109, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.2983521521091461, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.25581008195877075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0101, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.29260268807411194, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0102, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.3522181808948517, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0105, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.36269208788871765, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0103, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.40412119030952454, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0116, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.24089744687080383, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0119, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.4667617082595825, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0084, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.30139675736427307, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0101, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.38486286997795105, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0097, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.3526909649372101, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0071, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.3023934066295624, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0125, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.2796316146850586, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.25742489099502563, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0089, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.3626627027988434, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.01, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.3032572567462921, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0084, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.23514018952846527, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0086, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.3835832476615906, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0091, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.5170259475708008, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0146, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 0.8983817100524902, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0112, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.26260825991630554, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0086, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.481942743062973, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0126, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.311187207698822, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0064, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.3346790373325348, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.33836621046066284, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0085, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.3678463101387024, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0098, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.6136184334754944, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0154, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.39811593294143677, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0112, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6973778009414673, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0099, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.4773237109184265, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.3776084780693054, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.009, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 0.5061993598937988, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0097, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.41183987259864807, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.009, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 0.31513598561286926, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0112, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.4571514129638672, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0097, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.3183996379375458, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.2978666126728058, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0089, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.4791043698787689, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0087, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.5216032266616821, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0124, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.44693392515182495, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0092, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 0.41371819376945496, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.3593288064002991, + "learning_rate": 1.996106060741973e-05, + "loss": 0.014, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 0.4550306499004364, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0098, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.3510669469833374, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0066, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.2778814136981964, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.32210350036621094, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0067, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.42160800099372864, + "learning_rate": 1.995639934033493e-05, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.49051347374916077, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0102, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.3643694519996643, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.009, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.3717772960662842, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0076, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.32102280855178833, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0081, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.36725476384162903, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0102, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.39626258611679077, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0078, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.4183773696422577, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0105, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.3494930863380432, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0078, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.6155357956886292, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0119, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.34380587935447693, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0105, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.5476253032684326, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.01, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 0.37999996542930603, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0094, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 0.3124147057533264, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0125, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.4887244999408722, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.01, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5969874858856201, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0106, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 0.4295594096183777, + "learning_rate": 1.993971819309759e-05, + "loss": 0.007, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.3899303078651428, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0096, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.3912282884120941, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0075, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.5355616807937622, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.29141828417778015, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0129, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.24389855563640594, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.009, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.4070908725261688, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0085, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.26783379912376404, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0071, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.2644960880279541, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0089, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.35223162174224854, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0093, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.47337162494659424, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0095, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.25418519973754883, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0093, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 0.36384159326553345, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0082, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.30014440417289734, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0081, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.41121408343315125, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0081, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.5576186776161194, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.008, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.35785913467407227, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.3306240439414978, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0084, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 0.37215736508369446, + "learning_rate": 1.991774193879505e-05, + "loss": 0.012, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.5504099726676941, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0088, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.24932143092155457, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.007, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.5866615176200867, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.5174368619918823, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0121, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.2345893532037735, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0095, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.2683233916759491, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.2471713274717331, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.5090919733047485, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0108, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.2857886552810669, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0078, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.23729385435581207, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0096, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.30867621302604675, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0088, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.42522960901260376, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0103, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.37170591950416565, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0105, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.3672806918621063, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0121, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.4048611521720886, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.01, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.24768167734146118, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0125, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 0.5003495812416077, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0125, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.4303686022758484, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0084, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.3701602518558502, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0101, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.38272005319595337, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0075, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.2844183146953583, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0105, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.31114980578422546, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0095, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.3436568081378937, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.273001104593277, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0076, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.2653564512729645, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.3115384578704834, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0132, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.25932809710502625, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0083, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.28656521439552307, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0066, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.31808462738990784, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.0115, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.18877890706062317, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0092, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.3685394525527954, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0091, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.3878263533115387, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0082, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 0.284507691860199, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0085, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.3473755121231079, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0081, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.39935287833213806, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0081, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.34282153844833374, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0076, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.3581090271472931, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0087, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.37332627177238464, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0089, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 0.5224587321281433, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0089, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.42577075958251953, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0108, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.4602234959602356, + "learning_rate": 1.985504281027289e-05, + "loss": 0.014, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.4852961003780365, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0091, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.4437471628189087, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0112, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.37050408124923706, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0068, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.3345497250556946, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.36727628111839294, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0081, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 0.37056809663772583, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0152, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.5640603303909302, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0085, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 0.3653910160064697, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0078, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.2954258322715759, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0083, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6086210012435913, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0082, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 0.5260390043258667, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0105, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.3067379295825958, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0092, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.3480100929737091, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0088, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.26472753286361694, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0067, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.5254784226417542, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0146, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.35744136571884155, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0098, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.36186468601226807, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 0.35203835368156433, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0115, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.30590811371803284, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0108, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0082, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.2946765720844269, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0075, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.33707642555236816, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.007, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.2572688162326813, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0099, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.3901146352291107, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0185, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.4349755644798279, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0084, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.2383752018213272, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0092, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.46043846011161804, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.24630354344844818, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0062, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.5232640504837036, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 0.3850713074207306, + "learning_rate": 1.979809151602651e-05, + "loss": 0.014, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 0.44703760743141174, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0081, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.3762659728527069, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0099, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.4593638479709625, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0093, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.40554332733154297, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.33439910411834717, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0081, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.2623269855976105, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0062, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.22419600188732147, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0078, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.37183159589767456, + "learning_rate": 1.978133252131276e-05, + "loss": 0.01, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.27857136726379395, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.27683520317077637, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0069, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.45064759254455566, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0076, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.24215294420719147, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0071, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.5163891315460205, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0078, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.3922234773635864, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0077, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.19653558731079102, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0063, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.17621839046478271, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0084, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.6482162475585938, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.32759004831314087, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.33347561955451965, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0073, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.42883744835853577, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0084, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 0.3348788917064667, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0082, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.28349289298057556, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0102, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.2733197510242462, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0074, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.3263874351978302, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.01, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.295757532119751, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5598515868186951, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0093, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.425937294960022, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0083, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.2442379742860794, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0087, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.3378766179084778, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0163, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5137761831283569, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0099, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.3825916647911072, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0096, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.32084307074546814, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0066, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.3979593515396118, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0077, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.3103732764720917, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0067, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.5531997084617615, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0131, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5423216819763184, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0121, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.5038735270500183, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0087, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.44273868203163147, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.008, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.335232675075531, + "learning_rate": 1.971017390295979e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.4746256470680237, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.26807400584220886, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0075, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.35464033484458923, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0123, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.33803898096084595, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0094, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 0.20334473252296448, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.34386369585990906, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0081, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.38781842589378357, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0088, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.25994163751602173, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0079, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.3342406451702118, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0091, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.3120318353176117, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0079, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.3556351661682129, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0073, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.21421445906162262, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0095, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.39498451352119446, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0087, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0079, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.16734588146209717, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0072, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.3987548351287842, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.3929785490036011, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0096, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.2884303331375122, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0102, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.3338335454463959, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0092, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.47452738881111145, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0093, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.25584715604782104, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0068, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 0.3038389980792999, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0076, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.4123639464378357, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0101, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.38520675897598267, + "learning_rate": 1.964833301001045e-05, + "loss": 0.014, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.3355116844177246, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.3479195535182953, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0105, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.2700177729129791, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0076, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.963745667883003e-05, + "loss": 0.008, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 0.18578873574733734, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0071, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.26316413283348083, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0079, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.28762468695640564, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0115, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 0.3712877631187439, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.2862299382686615, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0072, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.2730867564678192, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0101, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.327648401260376, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.41153189539909363, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.32522135972976685, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0095, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.22764958441257477, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0085, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.3491888642311096, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.3123551607131958, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0103, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.1881783902645111, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0085, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.40902259945869446, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0089, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.382953941822052, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0088, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 0.23950865864753723, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0064, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.3419397175312042, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0118, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.42207059264183044, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0091, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.40754130482673645, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0087, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.2390766590833664, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0069, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.2974188029766083, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.0091, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.2993582785129547, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.42652204632759094, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.3138194680213928, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.009, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.38833311200141907, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0083, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.4015152156352997, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0081, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.42086881399154663, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.26732996106147766, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0071, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.5763937830924988, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0101, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.2955382764339447, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0075, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.4625638723373413, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0094, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.29631468653678894, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0096, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.46335819363594055, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0103, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.3183141350746155, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.008, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.26456212997436523, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0083, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 0.40924879908561707, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0097, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 0.3981763422489166, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0094, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.36437541246414185, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0064, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.2935962378978729, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0081, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.3478807210922241, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0079, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.3460087180137634, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0069, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.2706817090511322, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0088, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.2674945890903473, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0083, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.2268197238445282, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0072, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.3216208219528198, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0092, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.3226968050003052, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0101, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.2743329405784607, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0075, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.32573118805885315, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0094, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 0.53167325258255, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0099, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.3915646970272064, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0089, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.4526256322860718, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0101, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.323249489068985, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0094, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4046335816383362, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0088, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.34745559096336365, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0078, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.30308133363723755, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0071, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.37923407554626465, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0076, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 0.26785972714424133, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0093, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.2778306305408478, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0083, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.611038088798523, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0098, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.4114893078804016, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0111, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.2732110023498535, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0076, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.2964401841163635, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0095, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.40240928530693054, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0097, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.3901022672653198, + "learning_rate": 1.944152646499645e-05, + "loss": 0.008, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.38001132011413574, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0109, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.35937973856925964, + "learning_rate": 1.943474465322135e-05, + "loss": 0.007, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.2745327651500702, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0075, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.1598518043756485, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.007, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.401614785194397, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0115, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.4127846360206604, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0068, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.22147920727729797, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0061, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.28602245450019836, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0067, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.22147324681282043, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0076, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.2550548315048218, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0088, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.24113087356090546, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0076, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.3658410608768463, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0075, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.3856262266635895, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0112, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0075, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.3767516314983368, + "learning_rate": 1.938969919958475e-05, + "loss": 0.01, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.24380649626255035, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.009, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.30575039982795715, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0079, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.32913386821746826, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.009, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.29845312237739563, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0099, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.44377902150154114, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.34614384174346924, + "learning_rate": 1.936834723687526e-05, + "loss": 0.009, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.3316318690776825, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0096, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.4076138734817505, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 0.30320486426353455, + "learning_rate": 1.935753861926916e-05, + "loss": 0.015, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.32243025302886963, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.011, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.323745459318161, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0077, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5750753283500671, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0088, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.22709843516349792, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0101, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.3067542314529419, + "learning_rate": 1.933932815280178e-05, + "loss": 0.007, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.392337828874588, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0089, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.43343180418014526, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0073, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.4371345341205597, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0078, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.35214635729789734, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0077, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.3259161412715912, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.3849303722381592, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0066, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.3968902826309204, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.33016201853752136, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0095, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.3859156668186188, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.008, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.3020654618740082, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.007, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.44503262639045715, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0105, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.3908904194831848, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0073, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.39256253838539124, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0078, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.352611243724823, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0077, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.39203983545303345, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0081, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.23835115134716034, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0066, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.24996638298034668, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0098, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.29537609219551086, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.2898835837841034, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0077, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.4040369391441345, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.3501318395137787, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0094, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5462452173233032, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0097, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.4217568337917328, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0072, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.18295089900493622, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.3695569336414337, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.37818798422813416, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0089, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.29818472266197205, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0084, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.3328498303890228, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.01, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.340724378824234, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0075, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.2966301441192627, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0063, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.30677109956741333, + "learning_rate": 1.922098355206593e-05, + "loss": 0.008, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.2091839611530304, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0078, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.4229014217853546, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0115, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.40779992938041687, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0075, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.378817081451416, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.008, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.29796919226646423, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.2702767252922058, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0076, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.31349876523017883, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0085, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 0.30500444769859314, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0093, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.2860834002494812, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0061, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.26036593317985535, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0099, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.19049863517284393, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0075, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.3235284388065338, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.364092618227005, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.011, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.2409065216779709, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0092, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.36907926201820374, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.008, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.3230077922344208, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.191047802567482, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0063, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.3346494436264038, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0082, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.21352025866508484, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0075, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.5505086779594421, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.34264758229255676, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0083, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.20266413688659668, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0074, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.24938757717609406, + "learning_rate": 1.912718096497034e-05, + "loss": 0.007, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.4140026569366455, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0086, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.4424414038658142, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0104, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 0.5327904224395752, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 0.35958340764045715, + "learning_rate": 1.911035077753307e-05, + "loss": 0.01, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.2547682523727417, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0066, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.3701247274875641, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0115, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.34443217515945435, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0077, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.20353800058364868, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0061, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5660653114318848, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0091, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.26445311307907104, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0073, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.5561402440071106, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0071, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.3700469434261322, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.35783904790878296, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.008, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.3238641619682312, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0081, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.25247740745544434, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0099, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.435730904340744, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.008, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.37758126854896545, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0068, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.33323949575424194, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0094, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.4356318712234497, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 0.37893903255462646, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0058, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.4411139190196991, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0085, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0087, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4287096858024597, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0107, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.43085435032844543, + "learning_rate": 1.902392195640386e-05, + "loss": 0.009, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 0.2709400951862335, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0066, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.358126163482666, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0082, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.25320038199424744, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0077, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 0.31440937519073486, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0077, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.25246965885162354, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0079, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.28420332074165344, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0101, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.25251317024230957, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0075, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.19744229316711426, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0069, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4457854628562927, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0073, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.36817625164985657, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0096, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.3394709825515747, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0073, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.2909093201160431, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0065, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0072, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.25512900948524475, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0096, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0073, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.33459368348121643, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0096, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.21619321405887604, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0063, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.25518253445625305, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0067, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.2273867279291153, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.007, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.2864684462547302, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.3077942728996277, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0075, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 0.40526703000068665, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.41480064392089844, + "learning_rate": 1.891523933768891e-05, + "loss": 0.01, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.2750788629055023, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0064, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 0.29671600461006165, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0095, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 0.24160107970237732, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0069, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 0.2949109971523285, + "learning_rate": 1.889660337749874e-05, + "loss": 0.007, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.2847975492477417, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0059, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.30052465200424194, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0067, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.36128419637680054, + "learning_rate": 1.888252908366661e-05, + "loss": 0.014, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.36974236369132996, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0064, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.43730056285858154, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0084, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.3145422339439392, + "learning_rate": 1.88683715346172e-05, + "loss": 0.008, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.35473865270614624, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0091, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.2501350939273834, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.008, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.34808069467544556, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0099, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.45218509435653687, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0068, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.34530994296073914, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0098, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.38257333636283875, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0101, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.3040159344673157, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0079, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.3323517143726349, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0068, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.2639414370059967, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0078, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.3493870794773102, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.5838330984115601, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0091, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 0.428803026676178, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0087, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.3654572069644928, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0114, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.3295663297176361, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0075, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.3469060957431793, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0074, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.3366406261920929, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.32569241523742676, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0054, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3086700737476349, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0086, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.38562801480293274, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0092, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.3523421585559845, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0085, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.2278694063425064, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0063, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.32141822576522827, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0147, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.3375259041786194, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0077, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4483063220977783, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0062, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.3667140007019043, + "learning_rate": 1.874717450126662e-05, + "loss": 0.008, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.3419000506401062, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0079, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.36556369066238403, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0079, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.33135318756103516, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0064, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0091, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.34939518570899963, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0072, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.34424352645874023, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0077, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.3460613191127777, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.38822048902511597, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0066, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.35550639033317566, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0083, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0087, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.38202086091041565, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0081, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.25744789838790894, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0074, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.29700344800949097, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0082, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.305786669254303, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0076, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.3291271924972534, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.26111704111099243, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0074, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.348176509141922, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.27502793073654175, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0076, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.2831551432609558, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0092, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.39652079343795776, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0066, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.3885122239589691, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0087, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.37296077609062195, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.33606627583503723, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0086, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.3855937421321869, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0097, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.3322301506996155, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0076, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 0.33322253823280334, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.009, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.22358210384845734, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0088, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.5901851058006287, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0088, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.4703235328197479, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0084, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.20072896778583527, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0077, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.3537980616092682, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0098, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.3123277723789215, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0068, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.35979342460632324, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.38628828525543213, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0074, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.3498038053512573, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0074, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.20784054696559906, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0059, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.1811107099056244, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0085, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.43317103385925293, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0064, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.3815033435821533, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0064, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.35989734530448914, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.008, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.46118423342704773, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.012, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.25334376096725464, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0078, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.26764073967933655, + "learning_rate": 1.852547637090483e-05, + "loss": 0.01, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.2785920202732086, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0066, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.41587865352630615, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0061, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.42850133776664734, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.32369133830070496, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0091, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.2930110692977905, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0069, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.3199067711830139, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 0.4349478483200073, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0078, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 0.3054976165294647, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0061, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.2826739251613617, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0068, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.25106528401374817, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.007, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.25897887349128723, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0076, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.26398584246635437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.41751599311828613, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0083, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.17239610850811005, + "learning_rate": 1.844974808419918e-05, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3300461173057556, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0051, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.2645586133003235, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0068, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.24550332129001617, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0071, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.2889944911003113, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0091, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.476601779460907, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0066, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.35630306601524353, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0074, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.35651877522468567, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0084, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.3889803886413574, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0079, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4214278757572174, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.009, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.30540233850479126, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0083, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.3624532222747803, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0076, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.32963570952415466, + "learning_rate": 1.838347361898993e-05, + "loss": 0.01, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.3533381521701813, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0064, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.3011729419231415, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0065, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.4733760952949524, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0089, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.38553985953330994, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0059, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.2560643255710602, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0073, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.39531010389328003, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0106, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.2701983153820038, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0086, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.352717787027359, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0096, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.29157745838165283, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0073, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.4267994165420532, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0075, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.36308032274246216, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0075, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.33457428216934204, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0103, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.3717971444129944, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0069, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 0.21432936191558838, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0081, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.2878777086734772, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0057, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.4453850984573364, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0095, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.36917057633399963, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.3252313733100891, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0082, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.2529674470424652, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.2816419303417206, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0097, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.6464210152626038, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.33034399151802063, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0069, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.27335023880004883, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0078, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 0.3158395290374756, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.5128306746482849, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0087, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 0.24884961545467377, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0084, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.324278324842453, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0075, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6472476124763489, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0093, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.21269051730632782, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0066, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.29203882813453674, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0074, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.30436405539512634, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0087, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5066608190536499, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0081, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.32647472620010376, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0066, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.2804315388202667, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0066, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.24779941141605377, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0074, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.34001022577285767, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0101, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.2611280381679535, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0082, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.3129233717918396, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0079, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.2822776734828949, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0098, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.36969345808029175, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0064, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.33959338068962097, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0088, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.2628033459186554, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0062, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.38812723755836487, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0061, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.26403307914733887, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0055, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.28676870465278625, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0127, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.606293797492981, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.37321826815605164, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0063, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.368115097284317, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0091, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.3368416726589203, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0068, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.23466472327709198, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.006, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.3796599507331848, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0169, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.2202090471982956, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0099, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.5006175637245178, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0086, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.3673453629016876, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0083, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4379428029060364, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.006, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.43015891313552856, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0084, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.2806220054626465, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0061, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.23545289039611816, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0062, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 0.32115358114242554, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.3217777907848358, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.3224331736564636, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0072, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.31703537702560425, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0082, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 0.4175204932689667, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.008, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.22969186305999756, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0084, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.3421284258365631, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0077, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.32668444514274597, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0071, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.2729822099208832, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0068, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.33153197169303894, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0074, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.4678424000740051, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0076, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.23711496591567993, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0076, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.3230719566345215, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0084, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.32328692078590393, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0075, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.566879153251648, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0072, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.26277920603752136, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0062, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.339163601398468, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0082, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.23408609628677368, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0061, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.2942394018173218, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0065, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 0.3774799704551697, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0063, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.2847958207130432, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0072, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.2577030062675476, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0088, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.2883673906326294, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0075, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.3596307933330536, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0073, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.30285483598709106, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0076, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.2933914363384247, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0077, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.7666468024253845, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0102, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.31347739696502686, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0072, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.3435507118701935, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0081, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.3266170620918274, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0058, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.284027099609375, + "learning_rate": 1.784745142605655e-05, + "loss": 0.005, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.19972574710845947, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0072, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.2587524950504303, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0067, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.2922254204750061, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0064, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.17053507268428802, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0092, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.2850453555583954, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0073, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.2844892144203186, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0075, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.28969481587409973, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0079, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.4704195261001587, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0102, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.2652505338191986, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0077, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.2656702399253845, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0118, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.2282119244337082, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0086, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.30130353569984436, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0062, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.2295757234096527, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0066, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.25287938117980957, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0065, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.3274557292461395, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0076, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.34377023577690125, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0079, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.36259520053863525, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0055, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.24462608993053436, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0067, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.3615039587020874, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0088, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.40002626180648804, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0086, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.3362888991832733, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.33698126673698425, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0087, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.3287750482559204, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0068, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.23409898579120636, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0063, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.23275460302829742, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0066, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.35324692726135254, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0068, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.2781875729560852, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0066, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.3083304166793823, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0069, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.22543831169605255, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0066, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.22566530108451843, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0066, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.3640650808811188, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.35346123576164246, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0069, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0076, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.30895760655403137, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0074, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.30667638778686523, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0082, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.3134152889251709, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0086, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.21407048404216766, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.3456077575683594, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.4259016513824463, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.009, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.38690924644470215, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0094, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.31742537021636963, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0065, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.3568819463253021, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0077, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.3771888315677643, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.25528469681739807, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0067, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.36028411984443665, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0064, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.41987329721450806, + "learning_rate": 1.754802282200567e-05, + "loss": 0.007, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.18902993202209473, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0064, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.1859915405511856, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0086, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.1778331696987152, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0052, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4222147464752197, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.007, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.26806506514549255, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0074, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.34431734681129456, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0056, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.41732800006866455, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0079, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.3027847409248352, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0054, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.47592151165008545, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0066, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9539707899093628, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0095, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.4084669351577759, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0082, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.3052361309528351, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0072, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.23123528063297272, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.009, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.20356184244155884, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0073, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.048543930053711, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.3017459213733673, + "learning_rate": 1.74400239259128e-05, + "loss": 0.007, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.3679676353931427, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0085, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.20339734852313995, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0087, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.3523346781730652, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0076, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.4162348210811615, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0063, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.3293565511703491, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0067, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.24455691874027252, + "learning_rate": 1.739902378104222e-05, + "loss": 0.007, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 0.17645037174224854, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0051, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.2554231286048889, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0076, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.20006878674030304, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.27911216020584106, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.5701723694801331, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0081, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.222118079662323, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0072, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.2762138843536377, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0049, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 1.4110082387924194, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0114, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.31313180923461914, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0078, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.20941513776779175, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0079, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.3963930308818817, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0053, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.2066672146320343, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.3919369876384735, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0082, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.2544628083705902, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.0054, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.31123557686805725, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0078, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.24768301844596863, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0051, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.26674744486808777, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0052, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.27382466197013855, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.23384103178977966, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.0059, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.3531075417995453, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0068, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.34425088763237, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0066, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.2716144323348999, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.35163211822509766, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0071, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.23585639894008636, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0072, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.28066661953926086, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0068, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.3146689832210541, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0071, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.37553170323371887, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.008, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.18403242528438568, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0068, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.3904851973056793, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0072, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.4481397867202759, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0074, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.31124234199523926, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0074, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.3815377354621887, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0084, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.2909438908100128, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0074, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.3408021330833435, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.23902025818824768, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0076, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.2194853127002716, + "learning_rate": 1.714740708672306e-05, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 0.4337097108364105, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0092, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.4132380783557892, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0078, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.3434816598892212, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0076, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.25129666924476624, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0058, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.45458248257637024, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0064, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.5350340008735657, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.009, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 0.28008121252059937, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.33276447653770447, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0064, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37103456258773804, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0078, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 0.4689319133758545, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0073, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.3622629642486572, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.006, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0073, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.19226481020450592, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0059, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.27806007862091064, + "learning_rate": 1.704700993266678e-05, + "loss": 0.007, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.25948378443717957, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0076, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.5857216715812683, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.30467140674591064, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0073, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.2067701816558838, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0068, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 0.5653601288795471, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0087, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.3107249140739441, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0065, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4027363061904907, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0098, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.2757766544818878, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0091, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.30397671461105347, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0061, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.28112074732780457, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0063, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.14751966297626495, + "learning_rate": 1.696714953556411e-05, + "loss": 0.008, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.2988373935222626, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0055, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.2706286311149597, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0066, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.3612031042575836, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.25386789441108704, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0065, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.3170768916606903, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0056, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.4776926338672638, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0059, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.34828829765319824, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0088, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.20440815389156342, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.2943046987056732, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0068, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.16982606053352356, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0073, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.5607914924621582, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0085, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.35823172330856323, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0064, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.23943926393985748, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0068, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.24083787202835083, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0056, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.37987980246543884, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0062, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.35953620076179504, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0069, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.22255095839500427, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0071, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.4121200442314148, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0098, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.2377164363861084, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0076, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.2298472374677658, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0064, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.40824711322784424, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0066, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.33295100927352905, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.007, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.3978032171726227, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0077, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.27672451734542847, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.006, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.2591206729412079, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0089, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.1749347746372223, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0051, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.18699893355369568, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0056, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.240631103515625, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0089, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.3650512993335724, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0075, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.3503545820713043, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0067, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.3086877167224884, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.41695648431777954, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0064, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.33144691586494446, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0067, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.2679164409637451, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0072, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.22681233286857605, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0071, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.36362454295158386, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0067, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.20192845165729523, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0067, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.3895004093647003, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0055, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.22510671615600586, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0069, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.19641445577144623, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0101, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.2914806008338928, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0076, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.3187137544155121, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0059, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.3116552233695984, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0095, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.2597426772117615, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0058, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.21480600535869598, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.23912057280540466, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.006, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.317941278219223, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0064, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.58933025598526, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0095, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.21906700730323792, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0105, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.23899045586585999, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0059, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.2969389259815216, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0124, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.3514954447746277, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0066, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.18145518004894257, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0077, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.3087640404701233, + "learning_rate": 1.656303606359183e-05, + "loss": 0.006, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.3532063364982605, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0055, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.34000685811042786, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.24904295802116394, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0073, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.008, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.20241902768611908, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.009, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.3215351700782776, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0075, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0081, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.48170387744903564, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0071, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.3369109630584717, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0066, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.34541958570480347, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0058, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.2493886947631836, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0058, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.22845667600631714, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.2695702016353607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0055, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 0.28211796283721924, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0052, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.1901162564754486, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.2701025605201721, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0061, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.36527693271636963, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0072, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.3061700463294983, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0067, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.5612105131149292, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0087, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.23399880528450012, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0072, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.314933180809021, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0078, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.35548436641693115, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0094, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.37685567140579224, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0084, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.3190719783306122, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0065, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.26337119936943054, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0063, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.3518264889717102, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0072, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.3185817003250122, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.2995646893978119, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0064, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.3110463619232178, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0063, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.24277286231517792, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0064, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.17603862285614014, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0061, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.28089356422424316, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0076, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.2855492830276489, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0047, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.3247278928756714, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0058, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.18349547684192657, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0061, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.30654969811439514, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.007, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.2674420177936554, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0067, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.38177546858787537, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0091, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.33796218037605286, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0068, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.3754856586456299, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0063, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.21820858120918274, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.007, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.36184942722320557, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0061, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.32240399718284607, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0063, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 0.24755406379699707, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0059, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.397858589887619, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0064, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.389072448015213, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0063, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.3368140757083893, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0071, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0062, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.24265453219413757, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0076, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.19892603158950806, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0064, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.1852462887763977, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0051, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.1886446475982666, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0075, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.25982722640037537, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0068, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.3376137614250183, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0058, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.33173730969429016, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0064, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.3177517354488373, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0072, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.3385971784591675, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0066, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0073, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.2335229516029358, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0056, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.24502214789390564, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0054, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.2009458988904953, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.3341793715953827, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0082, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.3872147798538208, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0063, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.29940876364707947, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0073, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.4895729720592499, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0086, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.4485950469970703, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0053, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.22961653769016266, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0077, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.24187293648719788, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.005, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.3535212278366089, + "learning_rate": 1.601916647245149e-05, + "loss": 0.007, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.26539868116378784, + "learning_rate": 1.601107070706339e-05, + "loss": 0.008, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.43096065521240234, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0076, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.16919535398483276, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0058, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.2383720725774765, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0064, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.36103156208992004, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0067, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.2657287120819092, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0072, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.21437199413776398, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0065, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.34000417590141296, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0046, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.4855337142944336, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0068, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.3178497850894928, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0064, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.3171309530735016, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0067, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.3364340662956238, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0067, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2272711992263794, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0069, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.29505178332328796, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0078, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.3755042552947998, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0081, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.2983969449996948, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0085, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.3112468421459198, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0072, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.1950412392616272, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0061, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.2153436243534088, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.25062650442123413, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0079, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.1407836377620697, + "learning_rate": 1.584793312377278e-05, + "loss": 0.005, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.17276513576507568, + "learning_rate": 1.583971586792325e-05, + "loss": 0.006, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.47983887791633606, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0076, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.28724750876426697, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0076, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.3224884569644928, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0079, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.37969788908958435, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0063, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.48106926679611206, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0071, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.3555319905281067, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0075, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.19486083090305328, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.006, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.42018064856529236, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0074, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.3075830936431885, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0071, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.20921990275382996, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0063, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.20436584949493408, + "learning_rate": 1.574895332125391e-05, + "loss": 0.006, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0071, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.22980183362960815, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0078, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.24825431406497955, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.22042447328567505, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0071, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.249199777841568, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0076, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.32628607749938965, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0057, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.35151633620262146, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0059, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.29098865389823914, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0064, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.24006013572216034, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0058, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.2797141671180725, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0073, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.2963006794452667, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.19539053738117218, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0053, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.2686854898929596, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0051, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.35952430963516235, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0071, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.21042552590370178, + "learning_rate": 1.562410199183484e-05, + "loss": 0.005, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.27942436933517456, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0068, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.17137926816940308, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0063, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.20331411063671112, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0047, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.15683002769947052, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0052, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.14726290106773376, + "learning_rate": 1.558221191857467e-05, + "loss": 0.006, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.2940376400947571, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0068, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.4059796929359436, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0067, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.2587816119194031, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0086, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.3462979793548584, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0078, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.5607128739356995, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0079, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.24189788103103638, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0052, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 0.23362945020198822, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0073, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.22395116090774536, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0059, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.3514958322048187, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0064, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.25395795702934265, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0081, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.2948741018772125, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.22298739850521088, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.46948447823524475, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0097, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.2992243468761444, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0083, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.18001538515090942, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0055, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.23337051272392273, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.2863878905773163, + "learning_rate": 1.543878746906905e-05, + "loss": 0.006, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.23027309775352478, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0072, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.21359150111675262, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0064, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.3878735601902008, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0069, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.29146283864974976, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.007, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.21782676875591278, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0051, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.45582008361816406, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0063, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.4554077982902527, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0067, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.2254059612751007, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0064, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.13952374458312988, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0061, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.23241721093654633, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0072, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.3424162268638611, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0058, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.21074503660202026, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0057, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.33662086725234985, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0056, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.24403709173202515, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0073, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.27195101976394653, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0058, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.34224429726600647, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0072, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.29089581966400146, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0053, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3397226333618164, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0066, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.30517837405204773, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0092, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.3485032021999359, + "learning_rate": 1.52681291800283e-05, + "loss": 0.007, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.31346458196640015, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0045, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.1864607185125351, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.006, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.20976679027080536, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.22616958618164062, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0059, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.14772117137908936, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.33677151799201965, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0059, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.32354292273521423, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0061, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.21409569680690765, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0064, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.4659721851348877, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 0.32267874479293823, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.5019848942756653, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0061, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.32694318890571594, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0076, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.3013843297958374, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.1973707377910614, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0059, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.3365449607372284, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0059, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.3398677110671997, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.007, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.27888917922973633, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0062, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.2814931273460388, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0069, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.3317541182041168, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.006, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.21940776705741882, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0052, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.239700049161911, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0059, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.19117280840873718, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0071, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.21827168762683868, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0056, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.25645333528518677, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0085, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.30847233533859253, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0055, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.3127819895744324, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0058, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.30181658267974854, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0075, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.34778207540512085, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0077, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.18988046050071716, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0048, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.3479195833206177, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0045, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.24158424139022827, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0051, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.14698052406311035, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0053, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.4441753625869751, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0065, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.28078633546829224, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.29406028985977173, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0048, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3856968581676483, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0067, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.36528849601745605, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0062, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.34250667691230774, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0053, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.2862832844257355, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0055, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.3683549761772156, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0091, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.26892581582069397, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.2220073938369751, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0052, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.18825116753578186, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0065, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.28731998801231384, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0069, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.26817163825035095, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0058, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.44162800908088684, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0065, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 0.2990165948867798, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0074, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.20428279042243958, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0053, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.2918189465999603, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0056, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.30408942699432373, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0063, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.2593521177768707, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0061, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.34048640727996826, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0054, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.2438877820968628, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0059, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.5205245018005371, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0065, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.3658570349216461, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0061, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.23279106616973877, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0039, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.2704083323478699, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0054, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.1849551945924759, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0061, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.21807430684566498, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0059, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.47879981994628906, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0061, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.24125567078590393, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0056, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.25820469856262207, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0053, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.30664944648742676, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0075, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.3646678030490875, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0057, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.2534210979938507, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0045, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.2125798910856247, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 0.4387839734554291, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0072, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.337387353181839, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.01, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.23150259256362915, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.3243090808391571, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0076, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.26716119050979614, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.006, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.15551891922950745, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0061, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.1841796338558197, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0058, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 0.3119230270385742, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.2633327841758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0059, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.24567869305610657, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0055, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.3697315454483032, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0061, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.1941021829843521, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0052, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.2610131502151489, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.007, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.24856074154376984, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0062, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.27259066700935364, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0052, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.20962993800640106, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0055, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.4015270471572876, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.22935271263122559, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0063, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.29984018206596375, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0059, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.35775551199913025, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0079, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.15501125156879425, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0054, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.3543296158313751, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.1982075721025467, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.2616399824619293, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0062, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.2612541615962982, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0064, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3081730008125305, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0055, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.24024926126003265, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.20793405175209045, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.21445533633232117, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0058, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.24078251421451569, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0059, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.36214157938957214, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0061, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.2583295702934265, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0054, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.2641732394695282, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0069, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.2179708331823349, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0049, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.27418699860572815, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0049, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.3894921839237213, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0076, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.3912152945995331, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0063, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.16886518895626068, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0059, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.2731325626373291, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.3299262225627899, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.007, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.2671407163143158, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0058, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.2701479196548462, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0059, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.3803080916404724, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.2621704041957855, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0061, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.27780428528785706, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0065, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.3326016962528229, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.3632255792617798, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0069, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.24395202100276947, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0065, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.3215671181678772, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0066, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0065, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.31547197699546814, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0043, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.1893424689769745, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0059, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.27042335271835327, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0059, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.22597061097621918, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0063, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.1742873191833496, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0062, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.16797663271427155, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0048, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.42558521032333374, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0075, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.37216684222221375, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0061, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.19943472743034363, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.2211161106824875, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0075, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.2680184245109558, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0052, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.2402123361825943, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0051, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.1881084442138672, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0066, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.26134756207466125, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0063, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.3185539245605469, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0062, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.3118845820426941, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.22595946490764618, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.007, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.2627023458480835, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0067, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.2984865605831146, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0051, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0057, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.3078263998031616, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0074, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.17885653674602509, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0057, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.37737196683883667, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0058, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.21651378273963928, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0053, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.1974128633737564, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0059, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.17184904217720032, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.3074864447116852, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0059, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.28784239292144775, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0061, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.3435216546058655, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0065, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.38048845529556274, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0057, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.1875533014535904, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0052, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.48555630445480347, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0063, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 0.25066429376602173, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0055, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.2763892412185669, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0059, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.21217335760593414, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0092, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.23555652797222137, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.14828811585903168, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 0.27303484082221985, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0047, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.14681454002857208, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0067, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.43693456053733826, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.2940906286239624, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0059, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.20382657647132874, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0074, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0069, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.31879740953445435, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0062, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4898712933063507, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0051, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.17142456769943237, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0061, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.14010348916053772, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0045, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.26882827281951904, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0056, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.2636195421218872, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0048, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.24932081997394562, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0045, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.3367895185947418, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0049, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.15173649787902832, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0053, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.34083831310272217, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0072, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3327343165874481, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0048, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.36545902490615845, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0076, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.22761192917823792, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0067, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.19272181391716003, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0072, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.2881070375442505, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.006, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.32841676473617554, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0063, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.19850151240825653, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0052, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.31401291489601135, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.4023345112800598, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0058, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.25802844762802124, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0051, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.19678954780101776, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0053, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.4545653164386749, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0073, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.36174362897872925, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0068, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.31692951917648315, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0063, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.3470834195613861, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.29541268944740295, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0062, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.26377183198928833, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.006, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.2019137591123581, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0058, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.45156505703926086, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.007, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.15810425579547882, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.006, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.20093902945518494, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.28989917039871216, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0062, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.39454182982444763, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0063, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.25967612862586975, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0069, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.2058791220188141, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.26367849111557007, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0074, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.2432256042957306, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0054, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.19844679534435272, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.16757237911224365, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0052, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.2988821566104889, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0047, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.2231496274471283, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0048, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.265029639005661, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0048, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.41179928183555603, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0049, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.33498677611351013, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0052, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.2323407232761383, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0048, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.27306419610977173, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0061, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.2791977822780609, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0088, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.453421026468277, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.2572932839393616, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0056, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.19572272896766663, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0051, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.2831172049045563, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0057, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.21267575025558472, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0059, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.3220005929470062, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0057, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.2515857517719269, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0063, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.18344618380069733, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0052, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.34515154361724854, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0052, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.16711464524269104, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0054, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.3027217984199524, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.006, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.31168296933174133, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.007, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5778804421424866, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0056, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.2591782212257385, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.2449295073747635, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0046, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 0.19733767211437225, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0054, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.14837461709976196, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0053, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.3784295916557312, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0054, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.2400134950876236, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.17671307921409607, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0051, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.2664073705673218, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.006, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.25426605343818665, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0062, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.26733267307281494, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0049, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.46151378750801086, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.006, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.17070212960243225, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0062, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.42009514570236206, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0052, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.20439159870147705, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.25189417600631714, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0066, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.21402288973331451, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0072, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.294109046459198, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0061, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.29355865716934204, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0061, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.2937833368778229, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0061, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.1926010102033615, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0056, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.21794214844703674, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0065, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.23409108817577362, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0067, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.4696379005908966, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0062, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.28415724635124207, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0061, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.22433705627918243, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0064, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3090682923793793, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0056, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.23742817342281342, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0057, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.2670089900493622, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0052, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.2810697555541992, + "learning_rate": 1.299277443549658e-05, + "loss": 0.007, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.44233059883117676, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0069, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.14227768778800964, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0064, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.298776239156723, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0072, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.2882034480571747, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0064, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.23135380446910858, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0064, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.2870500981807709, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.005, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0064, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.2949783504009247, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0081, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.2215491235256195, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.26351356506347656, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0082, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.1909482628107071, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0052, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.13428187370300293, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0068, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.2125115543603897, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0048, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.27032148838043213, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0056, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.20981402695178986, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0069, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.24961373209953308, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0073, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.13643066585063934, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0054, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.25289252400398254, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.4061530828475952, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.006, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.29924723505973816, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0055, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.37029367685317993, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0053, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.37273409962654114, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0066, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.18242980539798737, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0054, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.18563945591449738, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.32972440123558044, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0045, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 0.3327874541282654, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0065, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.2077408730983734, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.1813255399465561, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0055, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.17811767756938934, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.20526157319545746, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0043, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.112189382314682, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0055, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.29082757234573364, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0099, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.23212411999702454, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0067, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.17449915409088135, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.3327349126338959, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0047, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.2709571123123169, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0056, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.19788618385791779, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0063, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.22075456380844116, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.2943982779979706, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0057, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.1718410849571228, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.3546068072319031, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.18132814764976501, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0047, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.17795684933662415, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0048, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.22964486479759216, + "learning_rate": 1.257232766480803e-05, + "loss": 0.005, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.3259448707103729, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0072, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.18410101532936096, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.28669047355651855, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0056, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.25986725091934204, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0055, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.1731722205877304, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0053, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.17501944303512573, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.005, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.2749968469142914, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0046, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.26125603914260864, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0055, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.22476239502429962, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0103, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.26169249415397644, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0067, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.19236186146736145, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0048, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.26535508036613464, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0055, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.2534106373786926, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.29464206099510193, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0076, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.3711875081062317, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0059, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0055, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.27274343371391296, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.006, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.15951389074325562, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0069, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.33735600113868713, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0064, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.19443227350711823, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0051, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.1960541307926178, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0049, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21133695542812347, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0066, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.22702853381633759, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.006, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.22489185631275177, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0061, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.33164891600608826, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0067, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.22196516394615173, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0055, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.19532594084739685, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0048, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.41902172565460205, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0064, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0052, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.2507944703102112, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0051, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.30817684531211853, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0052, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.27485454082489014, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.006, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.14287802577018738, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 0.14513961970806122, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.3345814645290375, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0051, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.2974685728549957, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0049, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.3455393612384796, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0062, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.16792115569114685, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.3038713335990906, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.005, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.2928559184074402, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.2317439168691635, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0039, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.3498123586177826, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0067, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.2850436866283417, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0045, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.18316122889518738, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0089, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.34362390637397766, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0066, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.13047993183135986, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0057, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.3403606116771698, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0055, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.27717292308807373, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0043, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.27412480115890503, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.1914675235748291, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0075, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.3778243958950043, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0084, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.20566068589687347, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.007, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.1868937760591507, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0051, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.24719548225402832, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.005, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0053, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4353996217250824, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.005, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.31571000814437866, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.005, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.14182177186012268, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0048, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.3461489975452423, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0062, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.17980965971946716, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0043, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.28671878576278687, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0048, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.18663623929023743, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0072, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.25223061442375183, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0063, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.20179906487464905, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.37325599789619446, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0079, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.18855971097946167, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.2992260754108429, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0051, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.18020357191562653, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0046, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.2106374204158783, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3749687373638153, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.1616801619529724, + "learning_rate": 1.188676298665799e-05, + "loss": 0.007, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.20882001519203186, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0143, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.16600479185581207, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0052, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.406480073928833, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0051, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.27349016070365906, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0056, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.2340608835220337, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0044, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.3165459632873535, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0042, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.19552721083164215, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0047, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.21882636845111847, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0061, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.23699741065502167, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.283207505941391, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0053, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.2782933712005615, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0062, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.3389151096343994, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0074, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.25642505288124084, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.19476772844791412, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0067, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.1992277055978775, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0057, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.21006375551223755, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0058, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.18808932602405548, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0073, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.258075475692749, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0052, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.29291409254074097, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.19002115726470947, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0041, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.4246057868003845, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 0.16166792809963226, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.005, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.35779255628585815, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.20405125617980957, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0082, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.23229332268238068, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0095, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.21156901121139526, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0074, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.22334401309490204, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0051, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.18344342708587646, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0048, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.22982414066791534, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0056, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.24991759657859802, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0046, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.27965986728668213, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.309841126203537, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0054, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.20964398980140686, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0044, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.45226722955703735, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0057, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.17177052795886993, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0064, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.8886939287185669, + "learning_rate": 1.153689339251154e-05, + "loss": 0.008, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.14726528525352478, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0066, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.32135209441185, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0064, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.22926779091358185, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.21345189213752747, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0047, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.31324461102485657, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0072, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.2185574620962143, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0047, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.36229151487350464, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0042, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.3479749262332916, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0053, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.23806153237819672, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.30633601546287537, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0079, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.2326052039861679, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0063, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 0.1756114363670349, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0064, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.18622055649757385, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0045, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.3261238932609558, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.16155003011226654, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0057, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.22661013901233673, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0046, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.24310468137264252, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0044, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.16182619333267212, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0056, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.1656215786933899, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0039, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.2945510447025299, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0049, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.24436083436012268, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.34221476316452026, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0069, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.26235878467559814, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0055, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.23333275318145752, + "learning_rate": 1.130316049722011e-05, + "loss": 0.005, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.23382601141929626, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0057, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 0.1693800389766693, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0058, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.3740929067134857, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.005, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.26146796345710754, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0038, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.13361674547195435, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0053, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8631370663642883, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0085, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.2952764630317688, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0054, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.23047442734241486, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0054, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0059, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.3246142864227295, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0066, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.31531205773353577, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0045, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4806351959705353, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0089, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.15645328164100647, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0051, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.29767802357673645, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0044, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.23338516056537628, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0055, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.20454354584217072, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0049, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.2087928056716919, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.004, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.18911990523338318, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0058, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.16931432485580444, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.3027138411998749, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.22635169327259064, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.26646292209625244, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0047, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.20067426562309265, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0054, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.22507227957248688, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0076, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.18533077836036682, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.005, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.1757635474205017, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0077, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.2326493263244629, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.006, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.2661048471927643, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0048, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.3285987079143524, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0047, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.3764145076274872, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.005, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.19637148082256317, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0048, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 0.16601431369781494, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.005, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.12405529618263245, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0036, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.21413138508796692, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.3323937952518463, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0057, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.20915299654006958, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0054, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.28372666239738464, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0048, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.32995301485061646, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0051, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.2148507684469223, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0061, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.22549118101596832, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.005, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.19749189913272858, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0049, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.250184565782547, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0065, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.23174546658992767, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.2707926034927368, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0049, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.175989031791687, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0058, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.2267833948135376, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0044, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.3495822846889496, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0048, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.2051204890012741, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0063, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.22149987518787384, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0058, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.21434035897254944, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0046, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.2996143400669098, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0065, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.22886960208415985, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0053, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.3317148685455322, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.005, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.45717868208885193, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0062, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.1223258301615715, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0051, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.2037084549665451, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0046, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.3772616982460022, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0045, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.30312252044677734, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0069, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.14988413453102112, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0047, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.3409348130226135, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.2308650016784668, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0049, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.15572187304496765, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0051, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.1962181180715561, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0049, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.337464302778244, + "learning_rate": 1.067930046280971e-05, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.17047251760959625, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0045, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.3098141849040985, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0043, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.17919068038463593, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0052, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.3461310863494873, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.006, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.37006744742393494, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0066, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.19726566970348358, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.005, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.1319705843925476, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0049, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.2131422460079193, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0055, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.1435563862323761, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0067, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.24024318158626556, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0055, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.1511068344116211, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0052, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.16795606911182404, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0047, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.1475641280412674, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0046, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.21277494728565216, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0048, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.2511015832424164, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.24675171077251434, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0059, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.2560728192329407, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0055, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.30879196524620056, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.005, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.1838868409395218, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0052, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.1673516035079956, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.20293423533439636, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0047, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.25513023138046265, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0052, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.26149800419807434, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.27551159262657166, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0041, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.2508440911769867, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0043, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.2889135181903839, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.1755184680223465, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0051, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.2095116674900055, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.33451047539711, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0079, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.44589516520500183, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0064, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.24158142507076263, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.15632936358451843, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.006, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.10808487981557846, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0065, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.1782998889684677, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0046, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.16395118832588196, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.004, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 0.30205732583999634, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0058, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.1561775654554367, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.004, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.1649634838104248, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0062, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.15428072214126587, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0043, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.11285894364118576, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0067, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.3470291793346405, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0056, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.16610246896743774, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0051, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.29931193590164185, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0051, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.15366005897521973, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.005, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.2352767139673233, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0057, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.19226962327957153, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0042, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.1903623789548874, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0044, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.4167932868003845, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0071, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.2913760840892792, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0046, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.2632276713848114, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0063, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.21258050203323364, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0043, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.19750680029392242, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0032, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.2896588444709778, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0045, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3017624020576477, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0074, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.18355949223041534, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0051, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.16483789682388306, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0056, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.2190672904253006, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.28435200452804565, + "learning_rate": 1.011517750003287e-05, + "loss": 0.005, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.2564929723739624, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0049, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.2592712342739105, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0048, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.18716935813426971, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0047, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.18236829340457916, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0049, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.27956655621528625, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0056, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.13664546608924866, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0048, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.21617569029331207, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0052, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.2196502536535263, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0054, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.20864732563495636, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0041, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.38381293416023254, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.005, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.1605401486158371, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0045, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.2079813927412033, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0051, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.2110205590724945, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0054, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.2421400547027588, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0048, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.41358140110969543, + "learning_rate": 9.969762660447491e-06, + "loss": 0.006, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.23386628925800323, + "learning_rate": 9.960077585586335e-06, + "loss": 0.005, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.20425592362880707, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0059, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.21164651215076447, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.1642364114522934, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0034, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.18716906011104584, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.15626995265483856, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0044, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.18394386768341064, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0044, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.3590037524700165, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0073, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.2103291153907776, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0051, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.19865299761295319, + "learning_rate": 9.87296819358355e-06, + "loss": 0.006, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.2052467316389084, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0065, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.31245940923690796, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0049, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.2959006726741791, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0042, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.33695659041404724, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0071, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.20898328721523285, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0062, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.3500119149684906, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0049, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.3926694095134735, + "learning_rate": 9.805290087509098e-06, + "loss": 0.007, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.24234539270401, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.1705496460199356, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0056, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.2907398045063019, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0048, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.2366454005241394, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0047, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.25498414039611816, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0046, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.163838192820549, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0048, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.1613040417432785, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0048, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.3639470338821411, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0042, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.22151169180870056, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0043, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.13474372029304504, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0051, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.2601003050804138, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0038, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.20202822983264923, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.18514803051948547, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0061, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.16678287088871002, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0038, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.17608965933322906, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0041, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.26356828212738037, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0059, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.297612726688385, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0047, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.16363881528377533, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.12642459571361542, + "learning_rate": 9.621949874438232e-06, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.3339644968509674, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0052, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.20784282684326172, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0046, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.28467273712158203, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0047, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.3124372661113739, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0051, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.3490087389945984, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.15114343166351318, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0051, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.41157594323158264, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0058, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.40405890345573425, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0045, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.1149911880493164, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0087, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.18746539950370789, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0058, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.1327875554561615, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0049, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.1530160903930664, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0038, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.2663615047931671, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0049, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.3390499949455261, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0046, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.2461002618074417, + "learning_rate": 9.477616135359713e-06, + "loss": 0.006, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.2141093611717224, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0049, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.20443470776081085, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0052, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.14927290380001068, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0039, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.3012462854385376, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0047, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.33484792709350586, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0045, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.19986321032047272, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0041, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.21612870693206787, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.19541047513484955, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0044, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.24203962087631226, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0049, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.1470087766647339, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0049, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.2336059808731079, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0048, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.32893121242523193, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0044, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.32034680247306824, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0055, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.27538758516311646, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0049, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.18869644403457642, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0065, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.2719379961490631, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0047, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.2850756347179413, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0043, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.19997543096542358, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0068, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.19222821295261383, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.16414248943328857, + "learning_rate": 9.285803018919292e-06, + "loss": 0.004, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.23754803836345673, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0039, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.2682085335254669, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0048, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.18268488347530365, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0046, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.14906349778175354, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0034, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.19079554080963135, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0041, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.09538780897855759, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.19193744659423828, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.1366361379623413, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.29436588287353516, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0052, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.24179348349571228, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0047, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.236627459526062, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0061, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.1719210296869278, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0054, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.2724406123161316, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.09852395206689835, + "learning_rate": 9.152007262148612e-06, + "loss": 0.004, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.23493632674217224, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.20697079598903656, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0047, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.16597376763820648, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0048, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.23542962968349457, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0046, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.18859006464481354, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0054, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.16773538291454315, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0044, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.2122378647327423, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0042, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.18205690383911133, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0046, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.1791398823261261, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0043, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.4446735680103302, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0052, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.32150915265083313, + "learning_rate": 9.047178679583151e-06, + "loss": 0.005, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.15855731070041656, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0045, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.19377414882183075, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0057, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.25969046354293823, + "learning_rate": 9.018636566864313e-06, + "loss": 0.006, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.2349981814622879, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.1853523701429367, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0051, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.22417226433753967, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.1969340741634369, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0058, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.18523764610290527, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.28188323974609375, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0052, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.18134717643260956, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0048, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.15660132467746735, + "learning_rate": 8.942627394858978e-06, + "loss": 0.004, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.3179869055747986, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0044, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.14007267355918884, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.31531354784965515, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0062, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.1867508888244629, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0054, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4172282814979553, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0056, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.21233956515789032, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0054, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.13055016100406647, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0048, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.24662990868091583, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0054, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.1877284198999405, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0045, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.20158089697360992, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0052, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.23169469833374023, + "learning_rate": 8.83836825410936e-06, + "loss": 0.0048, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.27991265058517456, + "learning_rate": 8.828905148874785e-06, + "loss": 0.008, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.3321090638637543, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.14790703356266022, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0033, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.1504756361246109, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0052, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.2211659848690033, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0038, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.1777208149433136, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0041, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.2586643397808075, + "learning_rate": 8.772180411864604e-06, + "loss": 0.006, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.2705499529838562, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.16527540981769562, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0037, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.24313445389270782, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0057, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.16705767810344696, + "learning_rate": 8.734416061983528e-06, + "loss": 0.004, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.20638783276081085, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0052, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.26159438490867615, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0039, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.30387070775032043, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0038, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.24292278289794922, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0042, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.3707493543624878, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0056, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.41142478585243225, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.22052627801895142, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0047, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.14626234769821167, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0047, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.25504666566848755, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0046, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.2020457535982132, + "learning_rate": 8.640192851412488e-06, + "loss": 0.006, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.2440478354692459, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0047, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.12040785700082779, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0044, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.25539812445640564, + "learning_rate": 8.611979388060327e-06, + "loss": 0.006, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.20701228082180023, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0041, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.24188214540481567, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0063, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.24987974762916565, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0063, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.20973123610019684, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0049, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.19898714125156403, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0061, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21703247725963593, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0056, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.18688541650772095, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0054, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.30194586515426636, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0049, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.17975366115570068, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0046, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.25966599583625793, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.1702205240726471, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.18940114974975586, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0052, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.18239127099514008, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0047, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.14571616053581238, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0046, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.17203395068645477, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0038, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.249881312251091, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0056, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.296194463968277, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0044, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.21376049518585205, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0052, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.2952374815940857, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.20862646400928497, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0051, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.17828255891799927, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0053, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.20771050453186035, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0038, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3046565651893616, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.12605167925357819, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0046, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.13702887296676636, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0038, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.11569058150053024, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0042, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.27488255500793457, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0054, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.30820342898368835, + "learning_rate": 8.349909816537207e-06, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.3108576536178589, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0056, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.16087505221366882, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0044, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.27139320969581604, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0055, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.17057007551193237, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0036, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.13946233689785004, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0057, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.2342602014541626, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0038, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.17249339818954468, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.2641673684120178, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0044, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.18304336071014404, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0041, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.25955966114997864, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0045, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.2159314751625061, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0038, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.254371702671051, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0043, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.10616741329431534, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0036, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.38598379492759705, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0065, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.3797863721847534, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0048, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.2059139758348465, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0062, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.19991335272789001, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0043, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.17376656830310822, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0047, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.17102457582950592, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0056, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.501983642578125, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0065, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.40338510274887085, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.10511627048254013, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0052, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.2610682249069214, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 0.09666074812412262, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0058, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.19014683365821838, + "learning_rate": 8.117972135268806e-06, + "loss": 0.005, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.2999255657196045, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.20351538062095642, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0049, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.1562410295009613, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0034, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.14160799980163574, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0035, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.10796743631362915, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0056, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.28861188888549805, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.3835368752479553, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.21850043535232544, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0038, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.2950346767902374, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0068, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.13051068782806396, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0041, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.11036359518766403, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0074, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.35306516289711, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0087, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.29782727360725403, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0045, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.20690713822841644, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0042, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.16064110398292542, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0038, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.2477649450302124, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0042, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.25939393043518066, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0045, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3345301151275635, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0045, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.19570066034793854, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0052, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.09655601531267166, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.13345655798912048, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0031, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.3130756616592407, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0072, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.16259168088436127, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0036, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.2581227123737335, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.36706119775772095, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.1705426573753357, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0069, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.4281153380870819, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0057, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.25743696093559265, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0036, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.17692404985427856, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.17617255449295044, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0043, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.193951815366745, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0042, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.2187023162841797, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0047, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.21488729119300842, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0039, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.13388743996620178, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0043, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.26977118849754333, + "learning_rate": 7.796848308199681e-06, + "loss": 0.004, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.40695786476135254, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0049, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.29070621728897095, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0056, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.2745647728443146, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0056, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.20881050825119019, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0057, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.17475518584251404, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0041, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.2414310723543167, + "learning_rate": 7.742248115573104e-06, + "loss": 0.004, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.20051640272140503, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.18383435904979706, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.16546988487243652, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0041, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.17165544629096985, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.25065234303474426, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0048, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.19762223958969116, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0038, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.23894545435905457, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.2860289216041565, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0053, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.3699626624584198, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0061, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.2370971292257309, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0043, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.19790691137313843, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0042, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.14648208022117615, + "learning_rate": 7.633462930388875e-06, + "loss": 0.005, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.105158232152462, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0032, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.24994254112243652, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0042, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.30648791790008545, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0058, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.16284243762493134, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0047, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.14919471740722656, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0045, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.14879491925239563, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0047, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.11741457879543304, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0041, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.09406878799200058, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0029, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.20860706269741058, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.24234607815742493, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0047, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.27025938034057617, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0042, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.15129081904888153, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0046, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.11173490434885025, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0035, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.2204807698726654, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0036, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.20111115276813507, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0087, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.213748961687088, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0045, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.21150177717208862, + "learning_rate": 7.480328799175369e-06, + "loss": 0.004, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.2450210005044937, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.16161729395389557, + "learning_rate": 7.4623904967312e-06, + "loss": 0.004, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.15077564120292664, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3078431487083435, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0051, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.15213221311569214, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.12404917925596237, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0042, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.18779516220092773, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0041, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.4039568603038788, + "learning_rate": 7.408675563767873e-06, + "loss": 0.005, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.2045651078224182, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0057, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.3885338306427002, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0049, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.253049373626709, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0059, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.250356525182724, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.3269367814064026, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0112, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.15401138365268707, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0052, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.1631775051355362, + "learning_rate": 7.346200065486093e-06, + "loss": 0.004, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.17112085223197937, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0038, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.24018551409244537, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0056, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.17964349687099457, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0057, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.1747465431690216, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0053, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.21299205720424652, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0038, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.13219258189201355, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0057, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 1.0558332204818726, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0066, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2154799997806549, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0041, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.13665339350700378, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.2101723700761795, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0039, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.13208501040935516, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0054, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.09342823177576065, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0032, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.22464905679225922, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0055, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.17030438780784607, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0042, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.17673689126968384, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0055, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.24041922390460968, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0048, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.14808662235736847, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.2489791214466095, + "learning_rate": 7.186522173441719e-06, + "loss": 0.004, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.19468742609024048, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0042, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.15028323233127594, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0061, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.13852037489414215, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.1401798278093338, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0063, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.1831122189760208, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0034, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.2867920994758606, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.13363438844680786, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0038, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.20085696876049042, + "learning_rate": 7.116016051769541e-06, + "loss": 0.004, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.1598372906446457, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0042, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.09672598540782928, + "learning_rate": 7.098434895408162e-06, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.18206225335597992, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0048, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.1818019449710846, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0038, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.21658800542354584, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.08513368666172028, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.10634194314479828, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0044, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.12106078863143921, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0037, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.11508465558290482, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0036, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.20805053412914276, + "learning_rate": 7.028294242074066e-06, + "loss": 0.004, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.23920200765132904, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0045, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.1300375908613205, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0045, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.23444809019565582, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0036, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.2636217772960663, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0044, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.31166398525238037, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.005, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.32881107926368713, + "learning_rate": 6.975884226362e-06, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.41748252511024475, + "learning_rate": 6.967165692827958e-06, + "loss": 0.006, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.1588834673166275, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0039, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.23697984218597412, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0039, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.19356773793697357, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0061, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.16373206675052643, + "learning_rate": 6.932338988482141e-06, + "loss": 0.004, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.1331700086593628, + "learning_rate": 6.923644220932124e-06, + "loss": 0.004, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4039696753025055, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0057, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.30325421690940857, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0065, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.21767468750476837, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0038, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.17474445700645447, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0056, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.17118008434772491, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.44261473417282104, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0063, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.18502798676490784, + "learning_rate": 6.862915366041247e-06, + "loss": 0.004, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.19384194910526276, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0036, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.1448352187871933, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0044, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3728172779083252, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0038, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.31421783566474915, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0043, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.28181371092796326, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0045, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.2249889373779297, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0041, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.26402008533477783, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0043, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.22621415555477142, + "learning_rate": 6.793802468038111e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.2681289315223694, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0045, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.17681041359901428, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0037, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.16526542603969574, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0032, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.30313149094581604, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0046, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.17628541588783264, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0065, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.1840096414089203, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.146232470870018, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0035, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.4804438352584839, + "learning_rate": 6.725005485342219e-06, + "loss": 0.005, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.2245558500289917, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0039, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.21845588088035583, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.1743943691253662, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0037, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.16978098452091217, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.27158796787261963, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0043, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.13516400754451752, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0048, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.1645064353942871, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0038, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.07616083323955536, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0046, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.13306911289691925, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.19445037841796875, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0044, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.18423207104206085, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0049, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.19280213117599487, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0043, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.25472623109817505, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0033, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.16799427568912506, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0031, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.2097395807504654, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.31450021266937256, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.16530238091945648, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0034, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.2506805956363678, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0038, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.1876160055398941, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0035, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.23704354465007782, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.13814999163150787, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0042, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.1164403185248375, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0042, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.23078426718711853, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0038, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.21749110519886017, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0046, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.24972137808799744, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.2491082102060318, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0043, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.14915086328983307, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0048, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.2794116735458374, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.13765662908554077, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0047, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.14874878525733948, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0042, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.1800280064344406, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0057, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.17518648505210876, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0049, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0045, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.3590790033340454, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0039, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.14534324407577515, + "learning_rate": 6.427861749601945e-06, + "loss": 0.004, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.1662825047969818, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.27466440200805664, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0045, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.1323469579219818, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0047, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.12367355078458786, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0077, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.18238325417041779, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0058, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.2733745574951172, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.3367181420326233, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0039, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.20671530067920685, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.23353071510791779, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0033, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.21081902086734772, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0031, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.3426077365875244, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0049, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.3905622959136963, + "learning_rate": 6.327475567095824e-06, + "loss": 0.004, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.1888400912284851, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0041, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.23982487618923187, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0041, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.2061331421136856, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0046, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.17000116407871246, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0033, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.15905790030956268, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0049, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.16794419288635254, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0052, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.3003343641757965, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0061, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.1429288536310196, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0042, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.18542084097862244, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0047, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.2692892253398895, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0035, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.23286236822605133, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0037, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.0963423103094101, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0041, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.1425798237323761, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0043, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.0960182398557663, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0046, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.2674477994441986, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 0.16276703774929047, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0041, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.24255621433258057, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.003, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.20395220816135406, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0054, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.12099681794643402, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0082, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.14017170667648315, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0042, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.28132137656211853, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0043, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.27220970392227173, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0039, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.23647353053092957, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0058, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.20623824000358582, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.12366114556789398, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0037, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.23330192267894745, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0056, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.19991633296012878, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0031, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.1496160626411438, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0058, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.13247868418693542, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0037, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.19072194397449493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0057, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.10773085057735443, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0042, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.14058449864387512, + "learning_rate": 6.063685039328116e-06, + "loss": 0.005, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.10825464874505997, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0042, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.18059906363487244, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.1713389754295349, + "learning_rate": 6.039253929027638e-06, + "loss": 0.005, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.23789434134960175, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0047, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.17626744508743286, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0041, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.2091904729604721, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0044, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.17293672263622284, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0043, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.13156521320343018, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0039, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.19591976702213287, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0043, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.16212835907936096, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0039, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.10661022365093231, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0037, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.16630858182907104, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.11001022905111313, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0037, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.1888381838798523, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0044, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.19239328801631927, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0044, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.16555139422416687, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0032, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.19748231768608093, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0043, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.1546473354101181, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0049, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.30511707067489624, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0037, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.1722872257232666, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0048, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.1784086525440216, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.15101182460784912, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0042, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.1252688318490982, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0041, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.15101821720600128, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0043, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.21302345395088196, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0035, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.1591431051492691, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0033, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.16010484099388123, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0049, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.19287234544754028, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0037, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.1804349720478058, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0036, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.14769446849822998, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0044, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.15914054214954376, + "learning_rate": 5.813791207086085e-06, + "loss": 0.004, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.19632315635681152, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0034, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3017818331718445, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0046, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.2728461027145386, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0044, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.18619874119758606, + "learning_rate": 5.781966956563247e-06, + "loss": 0.004, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.1235085129737854, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0037, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.15798084437847137, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0035, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.15713484585285187, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0036, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.15594886243343353, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0038, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.1558992713689804, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.20599815249443054, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0054, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.2785670757293701, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0042, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.22550497949123383, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.15210074186325073, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.18905121088027954, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.1337066888809204, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.23699362576007843, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.2480958253145218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0037, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.09328999370336533, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.3416430950164795, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0048, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.13258710503578186, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0032, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.18493984639644623, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0037, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.10433483123779297, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0045, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.18333138525485992, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0038, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.25164106488227844, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0058, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.17989882826805115, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0041, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.1597793847322464, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.1543695032596588, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0036, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.2985675036907196, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0043, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.1357773244380951, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0036, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.23978300392627716, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.005, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.12806151807308197, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0035, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.2222731113433838, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0039, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.16744646430015564, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0035, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.2162114977836609, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0048, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.14857177436351776, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0036, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.21318115293979645, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0032, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.257682204246521, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0036, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.254349946975708, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0042, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.148925319314003, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0029, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.1902056336402893, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0031, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.17580094933509827, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0026, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.18856695294380188, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0045, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.17185454070568085, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0039, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.1997966468334198, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0043, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.14173944294452667, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0033, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.20653635263442993, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0039, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.19571708142757416, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0026, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.0877508670091629, + "learning_rate": 5.438496901657042e-06, + "loss": 0.005, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.17305001616477966, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0038, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.16555450856685638, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0035, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.15395715832710266, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2430422455072403, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0032, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.2465265393257141, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0034, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.08382703363895416, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0038, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3427184224128723, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0042, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.13029031455516815, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.11826448887586594, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0035, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.1612391620874405, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0039, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21143540740013123, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0057, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.22977286577224731, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.005, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.11853202432394028, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0058, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.24277184903621674, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0038, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.2625603675842285, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0048, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.1333419382572174, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0033, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.09627685695886612, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.416618674993515, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0038, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.18699553608894348, + "learning_rate": 5.294041118587667e-06, + "loss": 0.004, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.1827329397201538, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0039, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.19719162583351135, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0034, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.09895205497741699, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0042, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.11187861114740372, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0036, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.154103085398674, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.11124159395694733, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.27686378359794617, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0041, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.12900429964065552, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.26441213488578796, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0032, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.2187345325946808, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.004, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.08503159135580063, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0034, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.12869144976139069, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0035, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.13212713599205017, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0027, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.23211228847503662, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0032, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.2017366737127304, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.21221789717674255, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0039, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.24497511982917786, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0046, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.15008985996246338, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0039, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.15641193091869354, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0039, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.2608455419540405, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0036, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.09808705747127533, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0038, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.18084567785263062, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0045, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.1957635134458542, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0036, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.1479685753583908, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0045, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.14854201674461365, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0035, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.14744973182678223, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0033, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7196730375289917, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0071, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.22570419311523438, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0056, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.16870586574077606, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0042, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.12610554695129395, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0051, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.11198554188013077, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0042, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.13166265189647675, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0037, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.1181526631116867, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0037, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.2055635005235672, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0027, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.13400030136108398, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0028, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.09746947884559631, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0048, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.22124870121479034, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0048, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.09961193799972534, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0038, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.20024695992469788, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0036, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.3697144687175751, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0044, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.1713833063840866, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0038, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.1914745569229126, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0051, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.190393328666687, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0035, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.17361588776111603, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0038, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.19456325471401215, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0036, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.15772588551044464, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0039, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.11680205166339874, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0045, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.3643893599510193, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0044, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.1628265231847763, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0036, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.10073156654834747, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0041, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.13039462268352509, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0045, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.12775596976280212, + "learning_rate": 4.911226880894818e-06, + "loss": 0.003, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.1513100564479828, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0044, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.1346164345741272, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0036, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.12880294024944305, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0041, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.3154917359352112, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0038, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.18458192050457, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0057, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.2524041533470154, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0038, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.11894001811742783, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0033, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.1094699576497078, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0037, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.11090611666440964, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0045, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.3179106116294861, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0041, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.09424899518489838, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0034, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.3028348982334137, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0035, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.30831560492515564, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0045, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34811046719551086, + "learning_rate": 4.81141273556404e-06, + "loss": 0.005, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.18413113057613373, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0044, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.11229179799556732, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.004, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.2966957688331604, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0056, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.10525348782539368, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0037, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.1479673534631729, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0049, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5229315757751465, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0051, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.17021632194519043, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0038, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.10177282989025116, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0036, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.17768025398254395, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0034, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.2090948224067688, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0027, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.1722206026315689, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0031, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.09709088504314423, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0037, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.1969165802001953, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0034, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.0810595229268074, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0038, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.22003750503063202, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0041, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.2809178829193115, + "learning_rate": 4.699083753549858e-06, + "loss": 0.003, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.1343737691640854, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0026, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.19191010296344757, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0035, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.16617201268672943, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0031, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.24936997890472412, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0032, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5643696188926697, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0053, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.19725625216960907, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0031, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.1692969799041748, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0035, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.17487913370132446, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.25642889738082886, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0043, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3692823350429535, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0039, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.230118989944458, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0047, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.1609203815460205, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0038, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.21201254427433014, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0029, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.10142157226800919, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0034, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.19121089577674866, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0033, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.156619131565094, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0045, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.14690659940242767, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0042, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.13466109335422516, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0041, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.3713383674621582, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0056, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.12184764444828033, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0038, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.23971956968307495, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0035, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.3320925235748291, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0052, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.11913793534040451, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0029, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.11725693941116333, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0044, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.1550632119178772, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0049, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.23413509130477905, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0043, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.16070885956287384, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0032, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.12317437678575516, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.3462170660495758, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0048, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.12654773890972137, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0045, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.06262557208538055, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0026, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.1439850926399231, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0045, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.24463413655757904, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0042, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.22048236429691315, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 0.10628963261842728, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0037, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.14685721695423126, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0043, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.18807503581047058, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0042, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.19162075221538544, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0047, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.2444164752960205, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0041, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.12120077759027481, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0037, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.19946682453155518, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.003, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.23982395231723785, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0035, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.13806626200675964, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0031, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2610985040664673, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0047, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.1384919434785843, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0055, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.14737965166568756, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0037, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.1304326057434082, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0036, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.22288398444652557, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0035, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.11266916245222092, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0071, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.15941838920116425, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0024, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.18921831250190735, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0026, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.10112889111042023, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0037, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.1865631341934204, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0028, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.20046782493591309, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0042, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.11953336745500565, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0026, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.17050383985042572, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0029, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.28782936930656433, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0042, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.2104359269142151, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0034, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.12790441513061523, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0048, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.12111827731132507, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0043, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.2542783319950104, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0043, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.17177502810955048, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0036, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.14121277630329132, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0051, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.11357807368040085, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.3277477025985718, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.37000587582588196, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0037, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.11122190207242966, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0034, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.14530375599861145, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0039, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.19974422454833984, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0033, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.15466761589050293, + "learning_rate": 4.230335566422999e-06, + "loss": 0.003, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.19129224121570587, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0042, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.2474614828824997, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0046, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.15569351613521576, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0048, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.09572251886129379, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.13737086951732635, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0043, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.12266672402620316, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0039, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.09208404272794724, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0039, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.16571840643882751, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0047, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.3071173131465912, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.09059276431798935, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0031, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.16070133447647095, + "learning_rate": 4.160146936563338e-06, + "loss": 0.004, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.12942227721214294, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0027, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.13913804292678833, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0048, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.206321582198143, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.20973987877368927, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0041, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.23191478848457336, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0063, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.18233250081539154, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0029, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.133034810423851, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0043, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.10777711123228073, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0028, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.14128559827804565, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0034, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.13215866684913635, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0036, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.18918493390083313, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0043, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.14459657669067383, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0045, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.17287056148052216, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0039, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.13909804821014404, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0039, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.14798089861869812, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0037, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.10916659235954285, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0023, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.1151762530207634, + "learning_rate": 4.053587511509546e-06, + "loss": 0.005, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.14232765138149261, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0032, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.09513483196496964, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0043, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.09156285226345062, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0039, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.1405397206544876, + "learning_rate": 4.028855757736123e-06, + "loss": 0.004, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.15840958058834076, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0037, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.190508171916008, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0037, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.15277954936027527, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0028, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 0.14111991226673126, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0034, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.31528833508491516, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0044, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.1420607715845108, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.1340852528810501, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0032, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.11166475713253021, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0033, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.13635945320129395, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0028, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.15865778923034668, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0036, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.08569981157779694, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0026, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.1041082963347435, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0033, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.17262709140777588, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0041, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.20455610752105713, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0035, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.15869568288326263, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0027, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.14855770766735077, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0042, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.08842955529689789, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0028, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.18251122534275055, + "learning_rate": 3.919189353330104e-06, + "loss": 0.003, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.24990014731884003, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0038, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.1088186502456665, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0036, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.09780745953321457, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0042, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.1625395119190216, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0033, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.16848890483379364, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0033, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.19756828248500824, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0051, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.15720513463020325, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0033, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.22365699708461761, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0028, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.07928138971328735, + "learning_rate": 3.865363184624925e-06, + "loss": 0.003, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.26314112544059753, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0049, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.1249697357416153, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0036, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 0.09758924692869186, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0031, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.08506497740745544, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0037, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.1978219896554947, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0036, + "step": 24000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4991099461999e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..19d22af7b7d6155175015b5c3c5b452030d153ea --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-24000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf8e16588ffacf58cd09ed0241d355125d76c992d11c15a4bc8ee94db38dc3b +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b26c6d76803d8a7c93a3bc6857d8c678a059b5cb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad879958aa4cd32b62962e92133083e053e47019d370e03438481886a51c84b +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7bde415ad97b84d892af2007e3300744a42f81d6 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b899adbb944ea037666e79c13548d61aa08105f4f3c52909ae121ae2e6ad46 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a1cf4d12722e229187ff387a0b2f540265438af4 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7850d1d55751b57560fddc5d89cb2763161f16f6c955699e9150708cad86c2f0 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..0da6184f9732635317d9591566929a0f088174db --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a4271bb2c4a9a7b6a9c624131e01adc7eafb24b1 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json @@ -0,0 +1,18234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5579123973875006, + "eval_steps": 500, + "global_step": 26000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 2.688621997833252, + "learning_rate": 1.8e-07, + "loss": 0.1495, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.1722424030303955, + "learning_rate": 3.8e-07, + "loss": 0.1358, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 2.3095974922180176, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1268, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 2.131070852279663, + "learning_rate": 7.8e-07, + "loss": 0.1224, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 2.273555278778076, + "learning_rate": 9.800000000000001e-07, + "loss": 0.118, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 1.3571869134902954, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.111, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 1.6004165410995483, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 1.0413638353347778, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0657, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 1.1965473890304565, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0493, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 1.1422100067138672, + "learning_rate": 1.98e-06, + "loss": 0.0444, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 0.6911118626594543, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0457, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 0.6770259737968445, + "learning_rate": 2.38e-06, + "loss": 0.0257, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 0.4811704456806183, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0208, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 0.7260023951530457, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0203, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 0.4369716942310333, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0174, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 0.4100959300994873, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0133, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 1.0024627447128296, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0149, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.4598183035850525, + "learning_rate": 3.58e-06, + "loss": 0.0143, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 0.7042055130004883, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0143, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 0.7677909731864929, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0151, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 0.45090702176094055, + "learning_rate": 4.18e-06, + "loss": 0.0113, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 0.4400976598262787, + "learning_rate": 4.38e-06, + "loss": 0.0155, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 0.2424178272485733, + "learning_rate": 4.58e-06, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 0.4720967411994934, + "learning_rate": 4.78e-06, + "loss": 0.0166, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 0.41622042655944824, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0104, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 0.6915765404701233, + "learning_rate": 5.18e-06, + "loss": 0.0108, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.25931113958358765, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0104, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.42486071586608887, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.3798843324184418, + "learning_rate": 5.78e-06, + "loss": 0.0107, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.3281213343143463, + "learning_rate": 5.98e-06, + "loss": 0.0081, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 0.3394489884376526, + "learning_rate": 6.18e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 0.38298189640045166, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0098, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 0.3188078999519348, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.3152049779891968, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0087, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.34163472056388855, + "learning_rate": 6.98e-06, + "loss": 0.01, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 0.43860143423080444, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0065, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.2845093309879303, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0086, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 0.4009752869606018, + "learning_rate": 7.58e-06, + "loss": 0.0099, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.37756970524787903, + "learning_rate": 7.78e-06, + "loss": 0.0097, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.38135284185409546, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0076, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 0.3145769536495209, + "learning_rate": 8.18e-06, + "loss": 0.0106, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 0.32534345984458923, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0069, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.24024507403373718, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0089, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 0.32857799530029297, + "learning_rate": 8.78e-06, + "loss": 0.0105, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.28823110461235046, + "learning_rate": 8.98e-06, + "loss": 0.0101, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 0.32506972551345825, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0126, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 0.19875723123550415, + "learning_rate": 9.38e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.3245992958545685, + "learning_rate": 9.58e-06, + "loss": 0.0099, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.24933603405952454, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0117, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.3154098391532898, + "learning_rate": 9.980000000000001e-06, + "loss": 0.009, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.3685779273509979, + "learning_rate": 1.018e-05, + "loss": 0.0101, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 0.7251449823379517, + "learning_rate": 1.038e-05, + "loss": 0.0119, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 0.3183727264404297, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.009, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.3737810254096985, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0089, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.45293235778808594, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.011, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 0.3476772606372833, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.008, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.38373252749443054, + "learning_rate": 1.138e-05, + "loss": 0.0088, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 0.2530902624130249, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.008, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 0.19455896317958832, + "learning_rate": 1.178e-05, + "loss": 0.008, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.3315221071243286, + "learning_rate": 1.198e-05, + "loss": 0.0102, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.23430880904197693, + "learning_rate": 1.218e-05, + "loss": 0.007, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.4636307656764984, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0075, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.3785994052886963, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0109, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.2804955542087555, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0099, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.393702894449234, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0132, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.400641530752182, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0099, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 0.24428881704807281, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0076, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 0.4449252188205719, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0103, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.406582236289978, + "learning_rate": 1.378e-05, + "loss": 0.0098, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.36386170983314514, + "learning_rate": 1.398e-05, + "loss": 0.0088, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.38196030259132385, + "learning_rate": 1.418e-05, + "loss": 0.01, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.28740620613098145, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.3616485297679901, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0094, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.4004146158695221, + "learning_rate": 1.478e-05, + "loss": 0.009, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.4585514962673187, + "learning_rate": 1.498e-05, + "loss": 0.0092, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.20028235018253326, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0138, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 0.46603646874427795, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0139, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0116, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.22323082387447357, + "learning_rate": 1.578e-05, + "loss": 0.0097, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.26777058839797974, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0081, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.32380548119544983, + "learning_rate": 1.618e-05, + "loss": 0.0087, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.5248059630393982, + "learning_rate": 1.638e-05, + "loss": 0.0102, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.3495309054851532, + "learning_rate": 1.658e-05, + "loss": 0.0121, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.3551771342754364, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.5039486289024353, + "learning_rate": 1.698e-05, + "loss": 0.0094, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.3826751410961151, + "learning_rate": 1.718e-05, + "loss": 0.0107, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.46699973940849304, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0122, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3312668204307556, + "learning_rate": 1.758e-05, + "loss": 0.0087, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 0.28113219141960144, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.49752357602119446, + "learning_rate": 1.798e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.4177795350551605, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0096, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.34015583992004395, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.4612225890159607, + "learning_rate": 1.858e-05, + "loss": 0.0084, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.3813643753528595, + "learning_rate": 1.878e-05, + "loss": 0.012, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 0.27937838435173035, + "learning_rate": 1.898e-05, + "loss": 0.0104, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.4471273422241211, + "learning_rate": 1.918e-05, + "loss": 0.0125, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.4010440707206726, + "learning_rate": 1.938e-05, + "loss": 0.0106, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.41607654094696045, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0107, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 0.3589233458042145, + "learning_rate": 1.978e-05, + "loss": 0.0081, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.5726460814476013, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0111, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.36717164516448975, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0102, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.47284170985221863, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.01, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.5372244119644165, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.40928924083709717, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0088, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.4905182421207428, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0107, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.3709850609302521, + "learning_rate": 1.999981616897523e-05, + "loss": 0.01, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 0.6419615745544434, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0095, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.4986196458339691, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0127, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.5523516535758972, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0115, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0113, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 0.5146775245666504, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0101, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.2972394824028015, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0092, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.4030104875564575, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0097, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 0.4765481650829315, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0136, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.4051239788532257, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.3703782558441162, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5248176455497742, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0112, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.3100311756134033, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0083, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.45929211378097534, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0114, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 0.5695507526397705, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.5395359992980957, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0151, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.5106327533721924, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0124, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.3423260450363159, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0132, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.32126766443252563, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.011, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.5105165839195251, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0085, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 0.31927764415740967, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0088, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 0.4421865940093994, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0093, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.2930506765842438, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0091, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.2920694053173065, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0085, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.2661049962043762, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0081, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 0.3047257661819458, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0083, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.2774506211280823, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.2554785907268524, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0096, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.5792570114135742, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0108, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.3250623941421509, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0125, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 0.5885359048843384, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0117, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.37988749146461487, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.009, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.3751101493835449, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0099, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.31976667046546936, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0097, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 0.37007251381874084, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0079, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.4624205231666565, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0103, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 0.3769538700580597, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0094, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.25460657477378845, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0076, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.3976004719734192, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0109, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.2983521521091461, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.25581008195877075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0101, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.29260268807411194, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0102, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.3522181808948517, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0105, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.36269208788871765, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0103, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.40412119030952454, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0116, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.24089744687080383, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0119, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.4667617082595825, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0084, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.30139675736427307, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0101, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.38486286997795105, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0097, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.3526909649372101, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0071, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.3023934066295624, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0125, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.2796316146850586, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.25742489099502563, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0089, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.3626627027988434, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.01, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.3032572567462921, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0084, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.23514018952846527, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0086, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.3835832476615906, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0091, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.5170259475708008, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0146, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 0.8983817100524902, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0112, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.26260825991630554, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0086, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.481942743062973, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0126, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.311187207698822, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0064, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.3346790373325348, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.33836621046066284, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0085, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.3678463101387024, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0098, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.6136184334754944, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0154, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.39811593294143677, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0112, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6973778009414673, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0099, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.4773237109184265, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.3776084780693054, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.009, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 0.5061993598937988, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0097, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.41183987259864807, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.009, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 0.31513598561286926, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0112, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.4571514129638672, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0097, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.3183996379375458, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.2978666126728058, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0089, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.4791043698787689, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0087, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.5216032266616821, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0124, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.44693392515182495, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0092, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 0.41371819376945496, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.3593288064002991, + "learning_rate": 1.996106060741973e-05, + "loss": 0.014, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 0.4550306499004364, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0098, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.3510669469833374, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0066, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.2778814136981964, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.32210350036621094, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0067, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.42160800099372864, + "learning_rate": 1.995639934033493e-05, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.49051347374916077, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0102, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.3643694519996643, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.009, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.3717772960662842, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0076, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.32102280855178833, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0081, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.36725476384162903, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0102, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.39626258611679077, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0078, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.4183773696422577, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0105, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.3494930863380432, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0078, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.6155357956886292, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0119, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.34380587935447693, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0105, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.5476253032684326, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.01, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 0.37999996542930603, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0094, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 0.3124147057533264, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0125, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.4887244999408722, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.01, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5969874858856201, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0106, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 0.4295594096183777, + "learning_rate": 1.993971819309759e-05, + "loss": 0.007, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.3899303078651428, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0096, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.3912282884120941, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0075, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.5355616807937622, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.29141828417778015, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0129, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.24389855563640594, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.009, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.4070908725261688, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0085, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.26783379912376404, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0071, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.2644960880279541, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0089, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.35223162174224854, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0093, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.47337162494659424, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0095, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.25418519973754883, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0093, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 0.36384159326553345, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0082, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.30014440417289734, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0081, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.41121408343315125, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0081, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.5576186776161194, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.008, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.35785913467407227, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.3306240439414978, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0084, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 0.37215736508369446, + "learning_rate": 1.991774193879505e-05, + "loss": 0.012, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.5504099726676941, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0088, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.24932143092155457, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.007, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.5866615176200867, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.5174368619918823, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0121, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.2345893532037735, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0095, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.2683233916759491, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.2471713274717331, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.5090919733047485, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0108, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.2857886552810669, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0078, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.23729385435581207, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0096, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.30867621302604675, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0088, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.42522960901260376, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0103, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.37170591950416565, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0105, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.3672806918621063, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0121, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.4048611521720886, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.01, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.24768167734146118, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0125, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 0.5003495812416077, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0125, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.4303686022758484, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0084, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.3701602518558502, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0101, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.38272005319595337, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0075, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.2844183146953583, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0105, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.31114980578422546, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0095, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.3436568081378937, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.273001104593277, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0076, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.2653564512729645, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.3115384578704834, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0132, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.25932809710502625, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0083, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.28656521439552307, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0066, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.31808462738990784, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.0115, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.18877890706062317, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0092, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.3685394525527954, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0091, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.3878263533115387, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0082, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 0.284507691860199, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0085, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.3473755121231079, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0081, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.39935287833213806, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0081, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.34282153844833374, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0076, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.3581090271472931, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0087, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.37332627177238464, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0089, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 0.5224587321281433, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0089, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.42577075958251953, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0108, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.4602234959602356, + "learning_rate": 1.985504281027289e-05, + "loss": 0.014, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.4852961003780365, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0091, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.4437471628189087, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0112, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.37050408124923706, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0068, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.3345497250556946, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.36727628111839294, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0081, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 0.37056809663772583, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0152, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.5640603303909302, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0085, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 0.3653910160064697, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0078, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.2954258322715759, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0083, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6086210012435913, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0082, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 0.5260390043258667, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0105, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.3067379295825958, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0092, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.3480100929737091, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0088, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.26472753286361694, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0067, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.5254784226417542, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0146, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.35744136571884155, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0098, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.36186468601226807, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 0.35203835368156433, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0115, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.30590811371803284, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0108, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0082, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.2946765720844269, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0075, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.33707642555236816, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.007, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.2572688162326813, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0099, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.3901146352291107, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0185, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.4349755644798279, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0084, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.2383752018213272, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0092, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.46043846011161804, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.24630354344844818, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0062, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.5232640504837036, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 0.3850713074207306, + "learning_rate": 1.979809151602651e-05, + "loss": 0.014, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 0.44703760743141174, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0081, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.3762659728527069, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0099, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.4593638479709625, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0093, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.40554332733154297, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.33439910411834717, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0081, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.2623269855976105, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0062, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.22419600188732147, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0078, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.37183159589767456, + "learning_rate": 1.978133252131276e-05, + "loss": 0.01, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.27857136726379395, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.27683520317077637, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0069, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.45064759254455566, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0076, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.24215294420719147, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0071, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.5163891315460205, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0078, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.3922234773635864, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0077, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.19653558731079102, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0063, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.17621839046478271, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0084, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.6482162475585938, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.32759004831314087, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.33347561955451965, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0073, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.42883744835853577, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0084, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 0.3348788917064667, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0082, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.28349289298057556, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0102, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.2733197510242462, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0074, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.3263874351978302, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.01, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.295757532119751, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5598515868186951, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0093, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.425937294960022, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0083, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.2442379742860794, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0087, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.3378766179084778, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0163, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5137761831283569, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0099, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.3825916647911072, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0096, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.32084307074546814, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0066, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.3979593515396118, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0077, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.3103732764720917, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0067, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.5531997084617615, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0131, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5423216819763184, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0121, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.5038735270500183, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0087, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.44273868203163147, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.008, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.335232675075531, + "learning_rate": 1.971017390295979e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.4746256470680237, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.26807400584220886, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0075, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.35464033484458923, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0123, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.33803898096084595, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0094, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 0.20334473252296448, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.34386369585990906, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0081, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.38781842589378357, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0088, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.25994163751602173, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0079, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.3342406451702118, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0091, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.3120318353176117, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0079, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.3556351661682129, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0073, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.21421445906162262, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0095, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.39498451352119446, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0087, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0079, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.16734588146209717, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0072, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.3987548351287842, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.3929785490036011, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0096, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.2884303331375122, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0102, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.3338335454463959, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0092, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.47452738881111145, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0093, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.25584715604782104, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0068, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 0.3038389980792999, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0076, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.4123639464378357, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0101, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.38520675897598267, + "learning_rate": 1.964833301001045e-05, + "loss": 0.014, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.3355116844177246, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.3479195535182953, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0105, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.2700177729129791, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0076, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.963745667883003e-05, + "loss": 0.008, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 0.18578873574733734, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0071, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.26316413283348083, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0079, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.28762468695640564, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0115, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 0.3712877631187439, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.2862299382686615, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0072, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.2730867564678192, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0101, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.327648401260376, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.41153189539909363, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.32522135972976685, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0095, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.22764958441257477, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0085, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.3491888642311096, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.3123551607131958, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0103, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.1881783902645111, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0085, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.40902259945869446, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0089, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.382953941822052, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0088, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 0.23950865864753723, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0064, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.3419397175312042, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0118, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.42207059264183044, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0091, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.40754130482673645, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0087, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.2390766590833664, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0069, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.2974188029766083, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.0091, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.2993582785129547, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.42652204632759094, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.3138194680213928, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.009, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.38833311200141907, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0083, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.4015152156352997, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0081, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.42086881399154663, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.26732996106147766, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0071, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.5763937830924988, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0101, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.2955382764339447, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0075, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.4625638723373413, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0094, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.29631468653678894, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0096, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.46335819363594055, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0103, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.3183141350746155, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.008, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.26456212997436523, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0083, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 0.40924879908561707, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0097, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 0.3981763422489166, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0094, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.36437541246414185, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0064, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.2935962378978729, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0081, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.3478807210922241, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0079, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.3460087180137634, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0069, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.2706817090511322, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0088, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.2674945890903473, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0083, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.2268197238445282, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0072, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.3216208219528198, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0092, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.3226968050003052, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0101, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.2743329405784607, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0075, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.32573118805885315, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0094, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 0.53167325258255, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0099, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.3915646970272064, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0089, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.4526256322860718, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0101, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.323249489068985, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0094, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4046335816383362, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0088, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.34745559096336365, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0078, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.30308133363723755, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0071, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.37923407554626465, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0076, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 0.26785972714424133, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0093, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.2778306305408478, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0083, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.611038088798523, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0098, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.4114893078804016, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0111, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.2732110023498535, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0076, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.2964401841163635, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0095, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.40240928530693054, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0097, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.3901022672653198, + "learning_rate": 1.944152646499645e-05, + "loss": 0.008, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.38001132011413574, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0109, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.35937973856925964, + "learning_rate": 1.943474465322135e-05, + "loss": 0.007, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.2745327651500702, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0075, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.1598518043756485, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.007, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.401614785194397, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0115, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.4127846360206604, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0068, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.22147920727729797, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0061, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.28602245450019836, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0067, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.22147324681282043, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0076, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.2550548315048218, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0088, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.24113087356090546, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0076, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.3658410608768463, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0075, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.3856262266635895, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0112, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0075, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.3767516314983368, + "learning_rate": 1.938969919958475e-05, + "loss": 0.01, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.24380649626255035, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.009, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.30575039982795715, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0079, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.32913386821746826, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.009, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.29845312237739563, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0099, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.44377902150154114, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.34614384174346924, + "learning_rate": 1.936834723687526e-05, + "loss": 0.009, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.3316318690776825, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0096, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.4076138734817505, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 0.30320486426353455, + "learning_rate": 1.935753861926916e-05, + "loss": 0.015, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.32243025302886963, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.011, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.323745459318161, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0077, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5750753283500671, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0088, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.22709843516349792, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0101, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.3067542314529419, + "learning_rate": 1.933932815280178e-05, + "loss": 0.007, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.392337828874588, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0089, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.43343180418014526, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0073, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.4371345341205597, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0078, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.35214635729789734, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0077, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.3259161412715912, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.3849303722381592, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0066, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.3968902826309204, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.33016201853752136, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0095, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.3859156668186188, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.008, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.3020654618740082, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.007, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.44503262639045715, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0105, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.3908904194831848, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0073, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.39256253838539124, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0078, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.352611243724823, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0077, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.39203983545303345, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0081, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.23835115134716034, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0066, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.24996638298034668, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0098, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.29537609219551086, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.2898835837841034, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0077, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.4040369391441345, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.3501318395137787, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0094, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5462452173233032, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0097, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.4217568337917328, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0072, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.18295089900493622, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.3695569336414337, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.37818798422813416, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0089, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.29818472266197205, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0084, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.3328498303890228, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.01, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.340724378824234, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0075, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.2966301441192627, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0063, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.30677109956741333, + "learning_rate": 1.922098355206593e-05, + "loss": 0.008, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.2091839611530304, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0078, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.4229014217853546, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0115, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.40779992938041687, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0075, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.378817081451416, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.008, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.29796919226646423, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.2702767252922058, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0076, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.31349876523017883, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0085, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 0.30500444769859314, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0093, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.2860834002494812, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0061, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.26036593317985535, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0099, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.19049863517284393, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0075, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.3235284388065338, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.364092618227005, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.011, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.2409065216779709, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0092, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.36907926201820374, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.008, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.3230077922344208, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.191047802567482, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0063, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.3346494436264038, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0082, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.21352025866508484, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0075, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.5505086779594421, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.34264758229255676, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0083, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.20266413688659668, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0074, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.24938757717609406, + "learning_rate": 1.912718096497034e-05, + "loss": 0.007, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.4140026569366455, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0086, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.4424414038658142, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0104, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 0.5327904224395752, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 0.35958340764045715, + "learning_rate": 1.911035077753307e-05, + "loss": 0.01, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.2547682523727417, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0066, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.3701247274875641, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0115, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.34443217515945435, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0077, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.20353800058364868, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0061, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5660653114318848, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0091, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.26445311307907104, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0073, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.5561402440071106, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0071, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.3700469434261322, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.35783904790878296, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.008, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.3238641619682312, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0081, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.25247740745544434, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0099, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.435730904340744, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.008, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.37758126854896545, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0068, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.33323949575424194, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0094, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.4356318712234497, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 0.37893903255462646, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0058, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.4411139190196991, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0085, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0087, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4287096858024597, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0107, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.43085435032844543, + "learning_rate": 1.902392195640386e-05, + "loss": 0.009, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 0.2709400951862335, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0066, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.358126163482666, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0082, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.25320038199424744, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0077, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 0.31440937519073486, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0077, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.25246965885162354, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0079, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.28420332074165344, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0101, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.25251317024230957, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0075, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.19744229316711426, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0069, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4457854628562927, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0073, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.36817625164985657, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0096, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.3394709825515747, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0073, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.2909093201160431, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0065, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0072, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.25512900948524475, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0096, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0073, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.33459368348121643, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0096, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.21619321405887604, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0063, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.25518253445625305, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0067, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.2273867279291153, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.007, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.2864684462547302, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.3077942728996277, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0075, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 0.40526703000068665, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.41480064392089844, + "learning_rate": 1.891523933768891e-05, + "loss": 0.01, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.2750788629055023, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0064, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 0.29671600461006165, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0095, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 0.24160107970237732, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0069, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 0.2949109971523285, + "learning_rate": 1.889660337749874e-05, + "loss": 0.007, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.2847975492477417, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0059, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.30052465200424194, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0067, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.36128419637680054, + "learning_rate": 1.888252908366661e-05, + "loss": 0.014, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.36974236369132996, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0064, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.43730056285858154, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0084, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.3145422339439392, + "learning_rate": 1.88683715346172e-05, + "loss": 0.008, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.35473865270614624, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0091, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.2501350939273834, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.008, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.34808069467544556, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0099, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.45218509435653687, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0068, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.34530994296073914, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0098, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.38257333636283875, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0101, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.3040159344673157, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0079, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.3323517143726349, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0068, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.2639414370059967, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0078, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.3493870794773102, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.5838330984115601, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0091, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 0.428803026676178, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0087, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.3654572069644928, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0114, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.3295663297176361, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0075, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.3469060957431793, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0074, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.3366406261920929, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.32569241523742676, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0054, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3086700737476349, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0086, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.38562801480293274, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0092, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.3523421585559845, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0085, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.2278694063425064, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0063, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.32141822576522827, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0147, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.3375259041786194, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0077, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4483063220977783, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0062, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.3667140007019043, + "learning_rate": 1.874717450126662e-05, + "loss": 0.008, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.3419000506401062, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0079, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.36556369066238403, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0079, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.33135318756103516, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0064, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0091, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.34939518570899963, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0072, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.34424352645874023, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0077, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.3460613191127777, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.38822048902511597, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0066, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.35550639033317566, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0083, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0087, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.38202086091041565, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0081, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.25744789838790894, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0074, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.29700344800949097, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0082, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.305786669254303, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0076, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.3291271924972534, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.26111704111099243, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0074, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.348176509141922, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.27502793073654175, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0076, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.2831551432609558, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0092, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.39652079343795776, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0066, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.3885122239589691, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0087, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.37296077609062195, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.33606627583503723, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0086, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.3855937421321869, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0097, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.3322301506996155, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0076, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 0.33322253823280334, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.009, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.22358210384845734, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0088, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.5901851058006287, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0088, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.4703235328197479, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0084, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.20072896778583527, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0077, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.3537980616092682, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0098, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.3123277723789215, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0068, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.35979342460632324, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.38628828525543213, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0074, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.3498038053512573, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0074, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.20784054696559906, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0059, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.1811107099056244, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0085, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.43317103385925293, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0064, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.3815033435821533, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0064, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.35989734530448914, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.008, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.46118423342704773, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.012, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.25334376096725464, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0078, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.26764073967933655, + "learning_rate": 1.852547637090483e-05, + "loss": 0.01, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.2785920202732086, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0066, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.41587865352630615, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0061, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.42850133776664734, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.32369133830070496, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0091, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.2930110692977905, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0069, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.3199067711830139, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 0.4349478483200073, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0078, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 0.3054976165294647, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0061, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.2826739251613617, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0068, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.25106528401374817, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.007, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.25897887349128723, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0076, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.26398584246635437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.41751599311828613, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0083, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.17239610850811005, + "learning_rate": 1.844974808419918e-05, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3300461173057556, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0051, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.2645586133003235, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0068, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.24550332129001617, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0071, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.2889944911003113, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0091, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.476601779460907, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0066, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.35630306601524353, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0074, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.35651877522468567, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0084, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.3889803886413574, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0079, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4214278757572174, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.009, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.30540233850479126, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0083, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.3624532222747803, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0076, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.32963570952415466, + "learning_rate": 1.838347361898993e-05, + "loss": 0.01, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.3533381521701813, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0064, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.3011729419231415, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0065, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.4733760952949524, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0089, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.38553985953330994, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0059, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.2560643255710602, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0073, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.39531010389328003, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0106, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.2701983153820038, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0086, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.352717787027359, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0096, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.29157745838165283, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0073, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.4267994165420532, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0075, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.36308032274246216, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0075, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.33457428216934204, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0103, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.3717971444129944, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0069, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 0.21432936191558838, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0081, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.2878777086734772, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0057, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.4453850984573364, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0095, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.36917057633399963, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.3252313733100891, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0082, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.2529674470424652, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.2816419303417206, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0097, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.6464210152626038, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.33034399151802063, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0069, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.27335023880004883, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0078, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 0.3158395290374756, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.5128306746482849, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0087, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 0.24884961545467377, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0084, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.324278324842453, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0075, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6472476124763489, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0093, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.21269051730632782, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0066, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.29203882813453674, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0074, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.30436405539512634, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0087, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5066608190536499, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0081, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.32647472620010376, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0066, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.2804315388202667, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0066, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.24779941141605377, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0074, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.34001022577285767, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0101, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.2611280381679535, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0082, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.3129233717918396, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0079, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.2822776734828949, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0098, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.36969345808029175, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0064, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.33959338068962097, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0088, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.2628033459186554, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0062, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.38812723755836487, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0061, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.26403307914733887, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0055, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.28676870465278625, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0127, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.606293797492981, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.37321826815605164, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0063, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.368115097284317, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0091, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.3368416726589203, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0068, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.23466472327709198, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.006, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.3796599507331848, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0169, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.2202090471982956, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0099, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.5006175637245178, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0086, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.3673453629016876, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0083, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4379428029060364, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.006, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.43015891313552856, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0084, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.2806220054626465, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0061, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.23545289039611816, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0062, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 0.32115358114242554, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.3217777907848358, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.3224331736564636, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0072, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.31703537702560425, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0082, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 0.4175204932689667, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.008, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.22969186305999756, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0084, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.3421284258365631, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0077, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.32668444514274597, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0071, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.2729822099208832, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0068, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.33153197169303894, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0074, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.4678424000740051, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0076, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.23711496591567993, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0076, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.3230719566345215, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0084, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.32328692078590393, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0075, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.566879153251648, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0072, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.26277920603752136, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0062, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.339163601398468, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0082, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.23408609628677368, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0061, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.2942394018173218, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0065, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 0.3774799704551697, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0063, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.2847958207130432, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0072, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.2577030062675476, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0088, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.2883673906326294, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0075, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.3596307933330536, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0073, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.30285483598709106, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0076, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.2933914363384247, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0077, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.7666468024253845, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0102, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.31347739696502686, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0072, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.3435507118701935, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0081, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.3266170620918274, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0058, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.284027099609375, + "learning_rate": 1.784745142605655e-05, + "loss": 0.005, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.19972574710845947, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0072, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.2587524950504303, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0067, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.2922254204750061, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0064, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.17053507268428802, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0092, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.2850453555583954, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0073, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.2844892144203186, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0075, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.28969481587409973, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0079, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.4704195261001587, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0102, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.2652505338191986, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0077, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.2656702399253845, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0118, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.2282119244337082, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0086, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.30130353569984436, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0062, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.2295757234096527, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0066, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.25287938117980957, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0065, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.3274557292461395, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0076, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.34377023577690125, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0079, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.36259520053863525, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0055, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.24462608993053436, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0067, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.3615039587020874, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0088, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.40002626180648804, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0086, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.3362888991832733, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.33698126673698425, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0087, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.3287750482559204, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0068, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.23409898579120636, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0063, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.23275460302829742, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0066, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.35324692726135254, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0068, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.2781875729560852, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0066, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.3083304166793823, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0069, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.22543831169605255, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0066, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.22566530108451843, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0066, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.3640650808811188, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.35346123576164246, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0069, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0076, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.30895760655403137, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0074, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.30667638778686523, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0082, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.3134152889251709, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0086, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.21407048404216766, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.3456077575683594, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.4259016513824463, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.009, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.38690924644470215, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0094, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.31742537021636963, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0065, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.3568819463253021, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0077, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.3771888315677643, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.25528469681739807, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0067, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.36028411984443665, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0064, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.41987329721450806, + "learning_rate": 1.754802282200567e-05, + "loss": 0.007, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.18902993202209473, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0064, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.1859915405511856, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0086, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.1778331696987152, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0052, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4222147464752197, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.007, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.26806506514549255, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0074, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.34431734681129456, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0056, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.41732800006866455, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0079, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.3027847409248352, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0054, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.47592151165008545, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0066, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9539707899093628, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0095, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.4084669351577759, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0082, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.3052361309528351, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0072, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.23123528063297272, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.009, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.20356184244155884, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0073, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.048543930053711, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.3017459213733673, + "learning_rate": 1.74400239259128e-05, + "loss": 0.007, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.3679676353931427, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0085, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.20339734852313995, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0087, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.3523346781730652, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0076, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.4162348210811615, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0063, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.3293565511703491, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0067, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.24455691874027252, + "learning_rate": 1.739902378104222e-05, + "loss": 0.007, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 0.17645037174224854, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0051, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.2554231286048889, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0076, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.20006878674030304, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.27911216020584106, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.5701723694801331, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0081, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.222118079662323, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0072, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.2762138843536377, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0049, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 1.4110082387924194, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0114, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.31313180923461914, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0078, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.20941513776779175, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0079, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.3963930308818817, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0053, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.2066672146320343, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.3919369876384735, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0082, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.2544628083705902, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.0054, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.31123557686805725, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0078, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.24768301844596863, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0051, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.26674744486808777, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0052, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.27382466197013855, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.23384103178977966, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.0059, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.3531075417995453, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0068, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.34425088763237, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0066, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.2716144323348999, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.35163211822509766, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0071, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.23585639894008636, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0072, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.28066661953926086, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0068, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.3146689832210541, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0071, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.37553170323371887, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.008, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.18403242528438568, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0068, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.3904851973056793, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0072, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.4481397867202759, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0074, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.31124234199523926, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0074, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.3815377354621887, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0084, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.2909438908100128, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0074, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.3408021330833435, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.23902025818824768, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0076, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.2194853127002716, + "learning_rate": 1.714740708672306e-05, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 0.4337097108364105, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0092, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.4132380783557892, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0078, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.3434816598892212, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0076, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.25129666924476624, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0058, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.45458248257637024, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0064, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.5350340008735657, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.009, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 0.28008121252059937, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.33276447653770447, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0064, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37103456258773804, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0078, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 0.4689319133758545, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0073, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.3622629642486572, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.006, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0073, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.19226481020450592, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0059, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.27806007862091064, + "learning_rate": 1.704700993266678e-05, + "loss": 0.007, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.25948378443717957, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0076, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.5857216715812683, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.30467140674591064, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0073, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.2067701816558838, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0068, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 0.5653601288795471, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0087, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.3107249140739441, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0065, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4027363061904907, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0098, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.2757766544818878, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0091, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.30397671461105347, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0061, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.28112074732780457, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0063, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.14751966297626495, + "learning_rate": 1.696714953556411e-05, + "loss": 0.008, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.2988373935222626, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0055, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.2706286311149597, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0066, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.3612031042575836, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.25386789441108704, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0065, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.3170768916606903, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0056, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.4776926338672638, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0059, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.34828829765319824, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0088, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.20440815389156342, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.2943046987056732, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0068, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.16982606053352356, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0073, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.5607914924621582, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0085, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.35823172330856323, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0064, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.23943926393985748, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0068, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.24083787202835083, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0056, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.37987980246543884, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0062, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.35953620076179504, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0069, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.22255095839500427, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0071, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.4121200442314148, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0098, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.2377164363861084, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0076, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.2298472374677658, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0064, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.40824711322784424, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0066, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.33295100927352905, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.007, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.3978032171726227, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0077, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.27672451734542847, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.006, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.2591206729412079, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0089, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.1749347746372223, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0051, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.18699893355369568, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0056, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.240631103515625, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0089, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.3650512993335724, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0075, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.3503545820713043, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0067, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.3086877167224884, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.41695648431777954, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0064, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.33144691586494446, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0067, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.2679164409637451, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0072, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.22681233286857605, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0071, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.36362454295158386, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0067, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.20192845165729523, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0067, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.3895004093647003, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0055, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.22510671615600586, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0069, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.19641445577144623, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0101, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.2914806008338928, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0076, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.3187137544155121, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0059, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.3116552233695984, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0095, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.2597426772117615, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0058, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.21480600535869598, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.23912057280540466, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.006, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.317941278219223, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0064, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.58933025598526, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0095, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.21906700730323792, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0105, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.23899045586585999, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0059, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.2969389259815216, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0124, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.3514954447746277, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0066, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.18145518004894257, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0077, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.3087640404701233, + "learning_rate": 1.656303606359183e-05, + "loss": 0.006, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.3532063364982605, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0055, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.34000685811042786, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.24904295802116394, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0073, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.008, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.20241902768611908, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.009, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.3215351700782776, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0075, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0081, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.48170387744903564, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0071, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.3369109630584717, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0066, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.34541958570480347, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0058, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.2493886947631836, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0058, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.22845667600631714, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.2695702016353607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0055, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 0.28211796283721924, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0052, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.1901162564754486, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.2701025605201721, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0061, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.36527693271636963, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0072, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.3061700463294983, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0067, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.5612105131149292, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0087, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.23399880528450012, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0072, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.314933180809021, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0078, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.35548436641693115, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0094, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.37685567140579224, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0084, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.3190719783306122, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0065, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.26337119936943054, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0063, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.3518264889717102, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0072, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.3185817003250122, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.2995646893978119, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0064, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.3110463619232178, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0063, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.24277286231517792, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0064, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.17603862285614014, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0061, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.28089356422424316, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0076, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.2855492830276489, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0047, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.3247278928756714, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0058, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.18349547684192657, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0061, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.30654969811439514, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.007, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.2674420177936554, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0067, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.38177546858787537, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0091, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.33796218037605286, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0068, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.3754856586456299, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0063, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.21820858120918274, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.007, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.36184942722320557, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0061, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.32240399718284607, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0063, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 0.24755406379699707, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0059, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.397858589887619, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0064, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.389072448015213, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0063, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.3368140757083893, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0071, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0062, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.24265453219413757, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0076, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.19892603158950806, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0064, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.1852462887763977, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0051, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.1886446475982666, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0075, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.25982722640037537, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0068, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.3376137614250183, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0058, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.33173730969429016, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0064, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.3177517354488373, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0072, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.3385971784591675, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0066, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0073, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.2335229516029358, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0056, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.24502214789390564, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0054, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.2009458988904953, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.3341793715953827, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0082, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.3872147798538208, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0063, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.29940876364707947, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0073, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.4895729720592499, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0086, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.4485950469970703, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0053, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.22961653769016266, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0077, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.24187293648719788, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.005, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.3535212278366089, + "learning_rate": 1.601916647245149e-05, + "loss": 0.007, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.26539868116378784, + "learning_rate": 1.601107070706339e-05, + "loss": 0.008, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.43096065521240234, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0076, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.16919535398483276, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0058, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.2383720725774765, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0064, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.36103156208992004, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0067, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.2657287120819092, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0072, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.21437199413776398, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0065, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.34000417590141296, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0046, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.4855337142944336, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0068, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.3178497850894928, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0064, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.3171309530735016, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0067, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.3364340662956238, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0067, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2272711992263794, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0069, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.29505178332328796, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0078, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.3755042552947998, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0081, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.2983969449996948, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0085, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.3112468421459198, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0072, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.1950412392616272, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0061, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.2153436243534088, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.25062650442123413, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0079, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.1407836377620697, + "learning_rate": 1.584793312377278e-05, + "loss": 0.005, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.17276513576507568, + "learning_rate": 1.583971586792325e-05, + "loss": 0.006, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.47983887791633606, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0076, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.28724750876426697, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0076, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.3224884569644928, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0079, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.37969788908958435, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0063, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.48106926679611206, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0071, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.3555319905281067, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0075, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.19486083090305328, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.006, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.42018064856529236, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0074, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.3075830936431885, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0071, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.20921990275382996, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0063, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.20436584949493408, + "learning_rate": 1.574895332125391e-05, + "loss": 0.006, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0071, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.22980183362960815, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0078, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.24825431406497955, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.22042447328567505, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0071, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.249199777841568, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0076, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.32628607749938965, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0057, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.35151633620262146, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0059, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.29098865389823914, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0064, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.24006013572216034, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0058, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.2797141671180725, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0073, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.2963006794452667, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.19539053738117218, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0053, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.2686854898929596, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0051, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.35952430963516235, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0071, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.21042552590370178, + "learning_rate": 1.562410199183484e-05, + "loss": 0.005, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.27942436933517456, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0068, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.17137926816940308, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0063, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.20331411063671112, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0047, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.15683002769947052, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0052, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.14726290106773376, + "learning_rate": 1.558221191857467e-05, + "loss": 0.006, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.2940376400947571, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0068, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.4059796929359436, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0067, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.2587816119194031, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0086, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.3462979793548584, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0078, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.5607128739356995, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0079, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.24189788103103638, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0052, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 0.23362945020198822, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0073, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.22395116090774536, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0059, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.3514958322048187, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0064, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.25395795702934265, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0081, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.2948741018772125, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.22298739850521088, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.46948447823524475, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0097, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.2992243468761444, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0083, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.18001538515090942, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0055, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.23337051272392273, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.2863878905773163, + "learning_rate": 1.543878746906905e-05, + "loss": 0.006, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.23027309775352478, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0072, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.21359150111675262, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0064, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.3878735601902008, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0069, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.29146283864974976, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.007, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.21782676875591278, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0051, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.45582008361816406, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0063, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.4554077982902527, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0067, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.2254059612751007, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0064, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.13952374458312988, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0061, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.23241721093654633, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0072, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.3424162268638611, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0058, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.21074503660202026, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0057, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.33662086725234985, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0056, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.24403709173202515, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0073, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.27195101976394653, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0058, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.34224429726600647, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0072, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.29089581966400146, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0053, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3397226333618164, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0066, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.30517837405204773, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0092, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.3485032021999359, + "learning_rate": 1.52681291800283e-05, + "loss": 0.007, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.31346458196640015, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0045, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.1864607185125351, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.006, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.20976679027080536, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.22616958618164062, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0059, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.14772117137908936, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.33677151799201965, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0059, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.32354292273521423, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0061, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.21409569680690765, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0064, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.4659721851348877, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 0.32267874479293823, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.5019848942756653, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0061, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.32694318890571594, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0076, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.3013843297958374, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.1973707377910614, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0059, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.3365449607372284, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0059, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.3398677110671997, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.007, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.27888917922973633, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0062, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.2814931273460388, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0069, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.3317541182041168, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.006, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.21940776705741882, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0052, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.239700049161911, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0059, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.19117280840873718, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0071, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.21827168762683868, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0056, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.25645333528518677, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0085, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.30847233533859253, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0055, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.3127819895744324, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0058, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.30181658267974854, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0075, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.34778207540512085, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0077, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.18988046050071716, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0048, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.3479195833206177, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0045, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.24158424139022827, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0051, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.14698052406311035, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0053, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.4441753625869751, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0065, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.28078633546829224, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.29406028985977173, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0048, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3856968581676483, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0067, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.36528849601745605, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0062, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.34250667691230774, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0053, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.2862832844257355, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0055, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.3683549761772156, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0091, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.26892581582069397, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.2220073938369751, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0052, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.18825116753578186, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0065, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.28731998801231384, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0069, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.26817163825035095, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0058, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.44162800908088684, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0065, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 0.2990165948867798, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0074, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.20428279042243958, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0053, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.2918189465999603, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0056, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.30408942699432373, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0063, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.2593521177768707, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0061, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.34048640727996826, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0054, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.2438877820968628, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0059, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.5205245018005371, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0065, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.3658570349216461, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0061, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.23279106616973877, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0039, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.2704083323478699, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0054, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.1849551945924759, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0061, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.21807430684566498, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0059, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.47879981994628906, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0061, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.24125567078590393, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0056, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.25820469856262207, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0053, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.30664944648742676, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0075, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.3646678030490875, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0057, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.2534210979938507, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0045, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.2125798910856247, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 0.4387839734554291, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0072, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.337387353181839, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.01, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.23150259256362915, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.3243090808391571, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0076, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.26716119050979614, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.006, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.15551891922950745, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0061, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.1841796338558197, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0058, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 0.3119230270385742, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.2633327841758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0059, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.24567869305610657, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0055, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.3697315454483032, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0061, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.1941021829843521, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0052, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.2610131502151489, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.007, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.24856074154376984, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0062, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.27259066700935364, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0052, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.20962993800640106, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0055, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.4015270471572876, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.22935271263122559, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0063, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.29984018206596375, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0059, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.35775551199913025, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0079, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.15501125156879425, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0054, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.3543296158313751, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.1982075721025467, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.2616399824619293, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0062, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.2612541615962982, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0064, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3081730008125305, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0055, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.24024926126003265, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.20793405175209045, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.21445533633232117, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0058, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.24078251421451569, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0059, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.36214157938957214, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0061, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.2583295702934265, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0054, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.2641732394695282, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0069, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.2179708331823349, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0049, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.27418699860572815, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0049, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.3894921839237213, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0076, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.3912152945995331, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0063, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.16886518895626068, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0059, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.2731325626373291, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.3299262225627899, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.007, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.2671407163143158, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0058, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.2701479196548462, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0059, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.3803080916404724, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.2621704041957855, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0061, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.27780428528785706, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0065, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.3326016962528229, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.3632255792617798, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0069, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.24395202100276947, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0065, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.3215671181678772, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0066, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0065, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.31547197699546814, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0043, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.1893424689769745, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0059, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.27042335271835327, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0059, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.22597061097621918, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0063, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.1742873191833496, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0062, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.16797663271427155, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0048, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.42558521032333374, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0075, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.37216684222221375, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0061, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.19943472743034363, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.2211161106824875, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0075, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.2680184245109558, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0052, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.2402123361825943, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0051, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.1881084442138672, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0066, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.26134756207466125, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0063, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.3185539245605469, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0062, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.3118845820426941, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.22595946490764618, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.007, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.2627023458480835, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0067, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.2984865605831146, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0051, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0057, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.3078263998031616, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0074, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.17885653674602509, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0057, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.37737196683883667, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0058, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.21651378273963928, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0053, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.1974128633737564, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0059, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.17184904217720032, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.3074864447116852, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0059, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.28784239292144775, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0061, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.3435216546058655, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0065, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.38048845529556274, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0057, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.1875533014535904, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0052, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.48555630445480347, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0063, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 0.25066429376602173, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0055, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.2763892412185669, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0059, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.21217335760593414, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0092, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.23555652797222137, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.14828811585903168, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 0.27303484082221985, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0047, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.14681454002857208, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0067, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.43693456053733826, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.2940906286239624, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0059, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.20382657647132874, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0074, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0069, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.31879740953445435, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0062, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4898712933063507, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0051, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.17142456769943237, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0061, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.14010348916053772, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0045, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.26882827281951904, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0056, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.2636195421218872, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0048, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.24932081997394562, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0045, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.3367895185947418, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0049, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.15173649787902832, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0053, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.34083831310272217, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0072, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3327343165874481, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0048, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.36545902490615845, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0076, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.22761192917823792, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0067, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.19272181391716003, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0072, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.2881070375442505, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.006, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.32841676473617554, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0063, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.19850151240825653, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0052, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.31401291489601135, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.4023345112800598, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0058, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.25802844762802124, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0051, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.19678954780101776, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0053, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.4545653164386749, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0073, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.36174362897872925, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0068, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.31692951917648315, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0063, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.3470834195613861, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.29541268944740295, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0062, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.26377183198928833, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.006, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.2019137591123581, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0058, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.45156505703926086, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.007, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.15810425579547882, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.006, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.20093902945518494, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.28989917039871216, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0062, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.39454182982444763, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0063, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.25967612862586975, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0069, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.2058791220188141, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.26367849111557007, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0074, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.2432256042957306, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0054, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.19844679534435272, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.16757237911224365, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0052, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.2988821566104889, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0047, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.2231496274471283, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0048, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.265029639005661, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0048, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.41179928183555603, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0049, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.33498677611351013, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0052, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.2323407232761383, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0048, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.27306419610977173, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0061, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.2791977822780609, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0088, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.453421026468277, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.2572932839393616, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0056, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.19572272896766663, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0051, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.2831172049045563, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0057, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.21267575025558472, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0059, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.3220005929470062, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0057, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.2515857517719269, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0063, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.18344618380069733, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0052, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.34515154361724854, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0052, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.16711464524269104, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0054, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.3027217984199524, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.006, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.31168296933174133, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.007, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5778804421424866, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0056, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.2591782212257385, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.2449295073747635, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0046, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 0.19733767211437225, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0054, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.14837461709976196, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0053, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.3784295916557312, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0054, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.2400134950876236, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.17671307921409607, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0051, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.2664073705673218, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.006, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.25426605343818665, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0062, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.26733267307281494, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0049, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.46151378750801086, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.006, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.17070212960243225, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0062, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.42009514570236206, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0052, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.20439159870147705, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.25189417600631714, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0066, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.21402288973331451, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0072, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.294109046459198, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0061, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.29355865716934204, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0061, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.2937833368778229, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0061, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.1926010102033615, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0056, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.21794214844703674, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0065, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.23409108817577362, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0067, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.4696379005908966, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0062, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.28415724635124207, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0061, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.22433705627918243, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0064, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3090682923793793, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0056, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.23742817342281342, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0057, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.2670089900493622, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0052, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.2810697555541992, + "learning_rate": 1.299277443549658e-05, + "loss": 0.007, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.44233059883117676, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0069, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.14227768778800964, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0064, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.298776239156723, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0072, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.2882034480571747, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0064, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.23135380446910858, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0064, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.2870500981807709, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.005, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0064, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.2949783504009247, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0081, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.2215491235256195, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.26351356506347656, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0082, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.1909482628107071, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0052, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.13428187370300293, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0068, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.2125115543603897, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0048, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.27032148838043213, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0056, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.20981402695178986, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0069, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.24961373209953308, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0073, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.13643066585063934, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0054, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.25289252400398254, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.4061530828475952, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.006, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.29924723505973816, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0055, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.37029367685317993, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0053, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.37273409962654114, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0066, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.18242980539798737, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0054, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.18563945591449738, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.32972440123558044, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0045, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 0.3327874541282654, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0065, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.2077408730983734, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.1813255399465561, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0055, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.17811767756938934, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.20526157319545746, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0043, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.112189382314682, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0055, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.29082757234573364, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0099, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.23212411999702454, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0067, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.17449915409088135, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.3327349126338959, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0047, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.2709571123123169, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0056, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.19788618385791779, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0063, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.22075456380844116, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.2943982779979706, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0057, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.1718410849571228, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.3546068072319031, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.18132814764976501, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0047, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.17795684933662415, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0048, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.22964486479759216, + "learning_rate": 1.257232766480803e-05, + "loss": 0.005, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.3259448707103729, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0072, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.18410101532936096, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.28669047355651855, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0056, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.25986725091934204, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0055, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.1731722205877304, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0053, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.17501944303512573, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.005, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.2749968469142914, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0046, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.26125603914260864, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0055, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.22476239502429962, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0103, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.26169249415397644, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0067, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.19236186146736145, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0048, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.26535508036613464, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0055, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.2534106373786926, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.29464206099510193, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0076, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.3711875081062317, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0059, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0055, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.27274343371391296, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.006, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.15951389074325562, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0069, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.33735600113868713, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0064, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.19443227350711823, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0051, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.1960541307926178, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0049, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21133695542812347, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0066, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.22702853381633759, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.006, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.22489185631275177, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0061, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.33164891600608826, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0067, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.22196516394615173, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0055, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.19532594084739685, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0048, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.41902172565460205, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0064, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0052, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.2507944703102112, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0051, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.30817684531211853, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0052, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.27485454082489014, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.006, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.14287802577018738, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 0.14513961970806122, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.3345814645290375, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0051, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.2974685728549957, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0049, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.3455393612384796, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0062, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.16792115569114685, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.3038713335990906, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.005, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.2928559184074402, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.2317439168691635, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0039, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.3498123586177826, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0067, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.2850436866283417, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0045, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.18316122889518738, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0089, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.34362390637397766, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0066, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.13047993183135986, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0057, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.3403606116771698, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0055, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.27717292308807373, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0043, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.27412480115890503, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.1914675235748291, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0075, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.3778243958950043, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0084, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.20566068589687347, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.007, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.1868937760591507, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0051, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.24719548225402832, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.005, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0053, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4353996217250824, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.005, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.31571000814437866, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.005, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.14182177186012268, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0048, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.3461489975452423, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0062, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.17980965971946716, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0043, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.28671878576278687, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0048, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.18663623929023743, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0072, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.25223061442375183, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0063, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.20179906487464905, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.37325599789619446, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0079, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.18855971097946167, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.2992260754108429, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0051, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.18020357191562653, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0046, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.2106374204158783, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3749687373638153, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.1616801619529724, + "learning_rate": 1.188676298665799e-05, + "loss": 0.007, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.20882001519203186, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0143, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.16600479185581207, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0052, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.406480073928833, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0051, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.27349016070365906, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0056, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.2340608835220337, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0044, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.3165459632873535, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0042, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.19552721083164215, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0047, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.21882636845111847, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0061, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.23699741065502167, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.283207505941391, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0053, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.2782933712005615, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0062, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.3389151096343994, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0074, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.25642505288124084, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.19476772844791412, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0067, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.1992277055978775, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0057, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.21006375551223755, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0058, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.18808932602405548, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0073, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.258075475692749, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0052, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.29291409254074097, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.19002115726470947, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0041, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.4246057868003845, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 0.16166792809963226, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.005, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.35779255628585815, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.20405125617980957, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0082, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.23229332268238068, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0095, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.21156901121139526, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0074, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.22334401309490204, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0051, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.18344342708587646, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0048, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.22982414066791534, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0056, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.24991759657859802, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0046, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.27965986728668213, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.309841126203537, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0054, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.20964398980140686, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0044, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.45226722955703735, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0057, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.17177052795886993, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0064, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.8886939287185669, + "learning_rate": 1.153689339251154e-05, + "loss": 0.008, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.14726528525352478, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0066, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.32135209441185, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0064, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.22926779091358185, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.21345189213752747, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0047, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.31324461102485657, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0072, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.2185574620962143, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0047, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.36229151487350464, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0042, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.3479749262332916, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0053, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.23806153237819672, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.30633601546287537, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0079, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.2326052039861679, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0063, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 0.1756114363670349, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0064, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.18622055649757385, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0045, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.3261238932609558, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.16155003011226654, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0057, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.22661013901233673, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0046, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.24310468137264252, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0044, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.16182619333267212, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0056, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.1656215786933899, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0039, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.2945510447025299, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0049, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.24436083436012268, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.34221476316452026, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0069, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.26235878467559814, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0055, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.23333275318145752, + "learning_rate": 1.130316049722011e-05, + "loss": 0.005, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.23382601141929626, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0057, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 0.1693800389766693, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0058, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.3740929067134857, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.005, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.26146796345710754, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0038, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.13361674547195435, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0053, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8631370663642883, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0085, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.2952764630317688, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0054, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.23047442734241486, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0054, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0059, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.3246142864227295, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0066, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.31531205773353577, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0045, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4806351959705353, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0089, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.15645328164100647, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0051, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.29767802357673645, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0044, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.23338516056537628, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0055, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.20454354584217072, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0049, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.2087928056716919, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.004, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.18911990523338318, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0058, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.16931432485580444, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.3027138411998749, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.22635169327259064, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.26646292209625244, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0047, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.20067426562309265, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0054, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.22507227957248688, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0076, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.18533077836036682, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.005, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.1757635474205017, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0077, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.2326493263244629, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.006, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.2661048471927643, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0048, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.3285987079143524, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0047, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.3764145076274872, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.005, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.19637148082256317, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0048, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 0.16601431369781494, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.005, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.12405529618263245, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0036, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.21413138508796692, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.3323937952518463, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0057, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.20915299654006958, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0054, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.28372666239738464, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0048, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.32995301485061646, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0051, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.2148507684469223, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0061, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.22549118101596832, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.005, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.19749189913272858, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0049, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.250184565782547, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0065, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.23174546658992767, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.2707926034927368, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0049, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.175989031791687, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0058, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.2267833948135376, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0044, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.3495822846889496, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0048, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.2051204890012741, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0063, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.22149987518787384, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0058, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.21434035897254944, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0046, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.2996143400669098, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0065, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.22886960208415985, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0053, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.3317148685455322, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.005, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.45717868208885193, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0062, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.1223258301615715, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0051, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.2037084549665451, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0046, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.3772616982460022, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0045, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.30312252044677734, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0069, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.14988413453102112, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0047, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.3409348130226135, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.2308650016784668, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0049, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.15572187304496765, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0051, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.1962181180715561, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0049, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.337464302778244, + "learning_rate": 1.067930046280971e-05, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.17047251760959625, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0045, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.3098141849040985, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0043, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.17919068038463593, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0052, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.3461310863494873, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.006, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.37006744742393494, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0066, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.19726566970348358, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.005, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.1319705843925476, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0049, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.2131422460079193, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0055, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.1435563862323761, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0067, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.24024318158626556, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0055, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.1511068344116211, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0052, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.16795606911182404, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0047, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.1475641280412674, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0046, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.21277494728565216, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0048, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.2511015832424164, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.24675171077251434, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0059, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.2560728192329407, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0055, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.30879196524620056, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.005, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.1838868409395218, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0052, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.1673516035079956, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.20293423533439636, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0047, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.25513023138046265, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0052, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.26149800419807434, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.27551159262657166, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0041, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.2508440911769867, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0043, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.2889135181903839, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.1755184680223465, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0051, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.2095116674900055, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.33451047539711, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0079, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.44589516520500183, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0064, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.24158142507076263, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.15632936358451843, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.006, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.10808487981557846, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0065, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.1782998889684677, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0046, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.16395118832588196, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.004, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 0.30205732583999634, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0058, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.1561775654554367, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.004, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.1649634838104248, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0062, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.15428072214126587, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0043, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.11285894364118576, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0067, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.3470291793346405, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0056, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.16610246896743774, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0051, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.29931193590164185, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0051, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.15366005897521973, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.005, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.2352767139673233, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0057, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.19226962327957153, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0042, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.1903623789548874, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0044, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.4167932868003845, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0071, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.2913760840892792, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0046, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.2632276713848114, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0063, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.21258050203323364, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0043, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.19750680029392242, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0032, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.2896588444709778, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0045, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3017624020576477, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0074, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.18355949223041534, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0051, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.16483789682388306, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0056, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.2190672904253006, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.28435200452804565, + "learning_rate": 1.011517750003287e-05, + "loss": 0.005, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.2564929723739624, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0049, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.2592712342739105, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0048, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.18716935813426971, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0047, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.18236829340457916, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0049, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.27956655621528625, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0056, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.13664546608924866, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0048, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.21617569029331207, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0052, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.2196502536535263, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0054, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.20864732563495636, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0041, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.38381293416023254, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.005, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.1605401486158371, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0045, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.2079813927412033, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0051, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.2110205590724945, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0054, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.2421400547027588, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0048, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.41358140110969543, + "learning_rate": 9.969762660447491e-06, + "loss": 0.006, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.23386628925800323, + "learning_rate": 9.960077585586335e-06, + "loss": 0.005, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.20425592362880707, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0059, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.21164651215076447, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.1642364114522934, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0034, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.18716906011104584, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.15626995265483856, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0044, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.18394386768341064, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0044, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.3590037524700165, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0073, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.2103291153907776, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0051, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.19865299761295319, + "learning_rate": 9.87296819358355e-06, + "loss": 0.006, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.2052467316389084, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0065, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.31245940923690796, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0049, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.2959006726741791, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0042, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.33695659041404724, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0071, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.20898328721523285, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0062, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.3500119149684906, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0049, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.3926694095134735, + "learning_rate": 9.805290087509098e-06, + "loss": 0.007, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.24234539270401, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.1705496460199356, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0056, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.2907398045063019, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0048, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.2366454005241394, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0047, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.25498414039611816, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0046, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.163838192820549, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0048, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.1613040417432785, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0048, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.3639470338821411, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0042, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.22151169180870056, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0043, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.13474372029304504, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0051, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.2601003050804138, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0038, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.20202822983264923, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.18514803051948547, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0061, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.16678287088871002, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0038, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.17608965933322906, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0041, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.26356828212738037, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0059, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.297612726688385, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0047, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.16363881528377533, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.12642459571361542, + "learning_rate": 9.621949874438232e-06, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.3339644968509674, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0052, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.20784282684326172, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0046, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.28467273712158203, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0047, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.3124372661113739, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0051, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.3490087389945984, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.15114343166351318, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0051, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.41157594323158264, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0058, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.40405890345573425, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0045, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.1149911880493164, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0087, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.18746539950370789, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0058, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.1327875554561615, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0049, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.1530160903930664, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0038, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.2663615047931671, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0049, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.3390499949455261, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0046, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.2461002618074417, + "learning_rate": 9.477616135359713e-06, + "loss": 0.006, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.2141093611717224, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0049, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.20443470776081085, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0052, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.14927290380001068, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0039, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.3012462854385376, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0047, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.33484792709350586, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0045, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.19986321032047272, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0041, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.21612870693206787, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.19541047513484955, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0044, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.24203962087631226, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0049, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.1470087766647339, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0049, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.2336059808731079, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0048, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.32893121242523193, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0044, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.32034680247306824, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0055, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.27538758516311646, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0049, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.18869644403457642, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0065, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.2719379961490631, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0047, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.2850756347179413, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0043, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.19997543096542358, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0068, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.19222821295261383, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.16414248943328857, + "learning_rate": 9.285803018919292e-06, + "loss": 0.004, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.23754803836345673, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0039, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.2682085335254669, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0048, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.18268488347530365, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0046, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.14906349778175354, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0034, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.19079554080963135, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0041, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.09538780897855759, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.19193744659423828, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.1366361379623413, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.29436588287353516, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0052, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.24179348349571228, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0047, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.236627459526062, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0061, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.1719210296869278, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0054, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.2724406123161316, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.09852395206689835, + "learning_rate": 9.152007262148612e-06, + "loss": 0.004, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.23493632674217224, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.20697079598903656, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0047, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.16597376763820648, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0048, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.23542962968349457, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0046, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.18859006464481354, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0054, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.16773538291454315, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0044, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.2122378647327423, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0042, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.18205690383911133, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0046, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.1791398823261261, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0043, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.4446735680103302, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0052, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.32150915265083313, + "learning_rate": 9.047178679583151e-06, + "loss": 0.005, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.15855731070041656, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0045, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.19377414882183075, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0057, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.25969046354293823, + "learning_rate": 9.018636566864313e-06, + "loss": 0.006, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.2349981814622879, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.1853523701429367, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0051, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.22417226433753967, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.1969340741634369, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0058, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.18523764610290527, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.28188323974609375, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0052, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.18134717643260956, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0048, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.15660132467746735, + "learning_rate": 8.942627394858978e-06, + "loss": 0.004, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.3179869055747986, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0044, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.14007267355918884, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.31531354784965515, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0062, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.1867508888244629, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0054, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4172282814979553, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0056, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.21233956515789032, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0054, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.13055016100406647, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0048, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.24662990868091583, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0054, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.1877284198999405, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0045, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.20158089697360992, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0052, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.23169469833374023, + "learning_rate": 8.83836825410936e-06, + "loss": 0.0048, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.27991265058517456, + "learning_rate": 8.828905148874785e-06, + "loss": 0.008, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.3321090638637543, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.14790703356266022, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0033, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.1504756361246109, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0052, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.2211659848690033, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0038, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.1777208149433136, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0041, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.2586643397808075, + "learning_rate": 8.772180411864604e-06, + "loss": 0.006, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.2705499529838562, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.16527540981769562, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0037, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.24313445389270782, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0057, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.16705767810344696, + "learning_rate": 8.734416061983528e-06, + "loss": 0.004, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.20638783276081085, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0052, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.26159438490867615, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0039, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.30387070775032043, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0038, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.24292278289794922, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0042, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.3707493543624878, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0056, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.41142478585243225, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.22052627801895142, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0047, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.14626234769821167, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0047, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.25504666566848755, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0046, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.2020457535982132, + "learning_rate": 8.640192851412488e-06, + "loss": 0.006, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.2440478354692459, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0047, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.12040785700082779, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0044, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.25539812445640564, + "learning_rate": 8.611979388060327e-06, + "loss": 0.006, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.20701228082180023, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0041, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.24188214540481567, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0063, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.24987974762916565, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0063, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.20973123610019684, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0049, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.19898714125156403, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0061, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21703247725963593, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0056, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.18688541650772095, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0054, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.30194586515426636, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0049, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.17975366115570068, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0046, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.25966599583625793, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.1702205240726471, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.18940114974975586, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0052, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.18239127099514008, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0047, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.14571616053581238, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0046, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.17203395068645477, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0038, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.249881312251091, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0056, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.296194463968277, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0044, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.21376049518585205, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0052, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.2952374815940857, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.20862646400928497, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0051, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.17828255891799927, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0053, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.20771050453186035, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0038, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3046565651893616, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.12605167925357819, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0046, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.13702887296676636, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0038, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.11569058150053024, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0042, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.27488255500793457, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0054, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.30820342898368835, + "learning_rate": 8.349909816537207e-06, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.3108576536178589, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0056, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.16087505221366882, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0044, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.27139320969581604, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0055, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.17057007551193237, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0036, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.13946233689785004, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0057, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.2342602014541626, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0038, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.17249339818954468, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.2641673684120178, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0044, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.18304336071014404, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0041, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.25955966114997864, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0045, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.2159314751625061, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0038, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.254371702671051, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0043, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.10616741329431534, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0036, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.38598379492759705, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0065, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.3797863721847534, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0048, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.2059139758348465, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0062, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.19991335272789001, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0043, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.17376656830310822, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0047, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.17102457582950592, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0056, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.501983642578125, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0065, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.40338510274887085, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.10511627048254013, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0052, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.2610682249069214, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 0.09666074812412262, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0058, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.19014683365821838, + "learning_rate": 8.117972135268806e-06, + "loss": 0.005, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.2999255657196045, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.20351538062095642, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0049, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.1562410295009613, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0034, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.14160799980163574, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0035, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.10796743631362915, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0056, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.28861188888549805, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.3835368752479553, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.21850043535232544, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0038, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.2950346767902374, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0068, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.13051068782806396, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0041, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.11036359518766403, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0074, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.35306516289711, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0087, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.29782727360725403, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0045, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.20690713822841644, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0042, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.16064110398292542, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0038, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.2477649450302124, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0042, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.25939393043518066, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0045, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3345301151275635, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0045, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.19570066034793854, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0052, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.09655601531267166, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.13345655798912048, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0031, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.3130756616592407, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0072, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.16259168088436127, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0036, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.2581227123737335, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.36706119775772095, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.1705426573753357, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0069, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.4281153380870819, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0057, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.25743696093559265, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0036, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.17692404985427856, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.17617255449295044, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0043, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.193951815366745, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0042, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.2187023162841797, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0047, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.21488729119300842, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0039, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.13388743996620178, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0043, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.26977118849754333, + "learning_rate": 7.796848308199681e-06, + "loss": 0.004, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.40695786476135254, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0049, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.29070621728897095, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0056, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.2745647728443146, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0056, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.20881050825119019, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0057, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.17475518584251404, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0041, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.2414310723543167, + "learning_rate": 7.742248115573104e-06, + "loss": 0.004, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.20051640272140503, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.18383435904979706, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.16546988487243652, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0041, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.17165544629096985, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.25065234303474426, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0048, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.19762223958969116, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0038, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.23894545435905457, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.2860289216041565, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0053, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.3699626624584198, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0061, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.2370971292257309, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0043, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.19790691137313843, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0042, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.14648208022117615, + "learning_rate": 7.633462930388875e-06, + "loss": 0.005, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.105158232152462, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0032, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.24994254112243652, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0042, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.30648791790008545, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0058, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.16284243762493134, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0047, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.14919471740722656, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0045, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.14879491925239563, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0047, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.11741457879543304, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0041, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.09406878799200058, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0029, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.20860706269741058, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.24234607815742493, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0047, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.27025938034057617, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0042, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.15129081904888153, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0046, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.11173490434885025, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0035, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.2204807698726654, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0036, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.20111115276813507, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0087, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.213748961687088, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0045, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.21150177717208862, + "learning_rate": 7.480328799175369e-06, + "loss": 0.004, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.2450210005044937, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.16161729395389557, + "learning_rate": 7.4623904967312e-06, + "loss": 0.004, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.15077564120292664, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3078431487083435, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0051, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.15213221311569214, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.12404917925596237, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0042, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.18779516220092773, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0041, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.4039568603038788, + "learning_rate": 7.408675563767873e-06, + "loss": 0.005, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.2045651078224182, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0057, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.3885338306427002, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0049, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.253049373626709, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0059, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.250356525182724, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.3269367814064026, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0112, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.15401138365268707, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0052, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.1631775051355362, + "learning_rate": 7.346200065486093e-06, + "loss": 0.004, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.17112085223197937, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0038, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.24018551409244537, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0056, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.17964349687099457, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0057, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.1747465431690216, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0053, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.21299205720424652, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0038, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.13219258189201355, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0057, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 1.0558332204818726, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0066, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2154799997806549, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0041, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.13665339350700378, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.2101723700761795, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0039, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.13208501040935516, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0054, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.09342823177576065, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0032, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.22464905679225922, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0055, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.17030438780784607, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0042, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.17673689126968384, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0055, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.24041922390460968, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0048, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.14808662235736847, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.2489791214466095, + "learning_rate": 7.186522173441719e-06, + "loss": 0.004, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.19468742609024048, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0042, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.15028323233127594, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0061, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.13852037489414215, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.1401798278093338, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0063, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.1831122189760208, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0034, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.2867920994758606, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.13363438844680786, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0038, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.20085696876049042, + "learning_rate": 7.116016051769541e-06, + "loss": 0.004, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.1598372906446457, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0042, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.09672598540782928, + "learning_rate": 7.098434895408162e-06, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.18206225335597992, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0048, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.1818019449710846, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0038, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.21658800542354584, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.08513368666172028, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.10634194314479828, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0044, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.12106078863143921, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0037, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.11508465558290482, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0036, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.20805053412914276, + "learning_rate": 7.028294242074066e-06, + "loss": 0.004, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.23920200765132904, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0045, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.1300375908613205, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0045, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.23444809019565582, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0036, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.2636217772960663, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0044, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.31166398525238037, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.005, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.32881107926368713, + "learning_rate": 6.975884226362e-06, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.41748252511024475, + "learning_rate": 6.967165692827958e-06, + "loss": 0.006, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.1588834673166275, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0039, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.23697984218597412, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0039, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.19356773793697357, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0061, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.16373206675052643, + "learning_rate": 6.932338988482141e-06, + "loss": 0.004, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.1331700086593628, + "learning_rate": 6.923644220932124e-06, + "loss": 0.004, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4039696753025055, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0057, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.30325421690940857, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0065, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.21767468750476837, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0038, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.17474445700645447, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0056, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.17118008434772491, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.44261473417282104, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0063, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.18502798676490784, + "learning_rate": 6.862915366041247e-06, + "loss": 0.004, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.19384194910526276, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0036, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.1448352187871933, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0044, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3728172779083252, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0038, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.31421783566474915, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0043, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.28181371092796326, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0045, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.2249889373779297, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0041, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.26402008533477783, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0043, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.22621415555477142, + "learning_rate": 6.793802468038111e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.2681289315223694, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0045, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.17681041359901428, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0037, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.16526542603969574, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0032, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.30313149094581604, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0046, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.17628541588783264, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0065, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.1840096414089203, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.146232470870018, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0035, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.4804438352584839, + "learning_rate": 6.725005485342219e-06, + "loss": 0.005, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.2245558500289917, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0039, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.21845588088035583, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.1743943691253662, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0037, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.16978098452091217, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.27158796787261963, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0043, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.13516400754451752, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0048, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.1645064353942871, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0038, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.07616083323955536, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0046, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.13306911289691925, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.19445037841796875, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0044, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.18423207104206085, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0049, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.19280213117599487, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0043, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.25472623109817505, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0033, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.16799427568912506, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0031, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.2097395807504654, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.31450021266937256, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.16530238091945648, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0034, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.2506805956363678, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0038, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.1876160055398941, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0035, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.23704354465007782, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.13814999163150787, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0042, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.1164403185248375, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0042, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.23078426718711853, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0038, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.21749110519886017, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0046, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.24972137808799744, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.2491082102060318, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0043, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.14915086328983307, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0048, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.2794116735458374, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.13765662908554077, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0047, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.14874878525733948, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0042, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.1800280064344406, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0057, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.17518648505210876, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0049, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0045, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.3590790033340454, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0039, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.14534324407577515, + "learning_rate": 6.427861749601945e-06, + "loss": 0.004, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.1662825047969818, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.27466440200805664, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0045, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.1323469579219818, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0047, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.12367355078458786, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0077, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.18238325417041779, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0058, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.2733745574951172, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.3367181420326233, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0039, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.20671530067920685, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.23353071510791779, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0033, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.21081902086734772, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0031, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.3426077365875244, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0049, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.3905622959136963, + "learning_rate": 6.327475567095824e-06, + "loss": 0.004, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.1888400912284851, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0041, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.23982487618923187, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0041, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.2061331421136856, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0046, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.17000116407871246, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0033, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.15905790030956268, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0049, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.16794419288635254, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0052, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.3003343641757965, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0061, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.1429288536310196, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0042, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.18542084097862244, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0047, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.2692892253398895, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0035, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.23286236822605133, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0037, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.0963423103094101, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0041, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.1425798237323761, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0043, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.0960182398557663, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0046, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.2674477994441986, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 0.16276703774929047, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0041, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.24255621433258057, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.003, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.20395220816135406, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0054, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.12099681794643402, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0082, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.14017170667648315, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0042, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.28132137656211853, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0043, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.27220970392227173, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0039, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.23647353053092957, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0058, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.20623824000358582, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.12366114556789398, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0037, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.23330192267894745, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0056, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.19991633296012878, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0031, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.1496160626411438, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0058, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.13247868418693542, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0037, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.19072194397449493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0057, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.10773085057735443, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0042, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.14058449864387512, + "learning_rate": 6.063685039328116e-06, + "loss": 0.005, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.10825464874505997, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0042, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.18059906363487244, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.1713389754295349, + "learning_rate": 6.039253929027638e-06, + "loss": 0.005, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.23789434134960175, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0047, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.17626744508743286, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0041, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.2091904729604721, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0044, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.17293672263622284, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0043, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.13156521320343018, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0039, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.19591976702213287, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0043, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.16212835907936096, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0039, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.10661022365093231, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0037, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.16630858182907104, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.11001022905111313, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0037, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.1888381838798523, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0044, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.19239328801631927, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0044, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.16555139422416687, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0032, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.19748231768608093, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0043, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.1546473354101181, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0049, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.30511707067489624, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0037, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.1722872257232666, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0048, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.1784086525440216, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.15101182460784912, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0042, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.1252688318490982, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0041, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.15101821720600128, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0043, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.21302345395088196, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0035, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.1591431051492691, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0033, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.16010484099388123, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0049, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.19287234544754028, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0037, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.1804349720478058, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0036, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.14769446849822998, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0044, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.15914054214954376, + "learning_rate": 5.813791207086085e-06, + "loss": 0.004, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.19632315635681152, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0034, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3017818331718445, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0046, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.2728461027145386, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0044, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.18619874119758606, + "learning_rate": 5.781966956563247e-06, + "loss": 0.004, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.1235085129737854, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0037, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.15798084437847137, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0035, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.15713484585285187, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0036, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.15594886243343353, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0038, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.1558992713689804, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.20599815249443054, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0054, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.2785670757293701, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0042, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.22550497949123383, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.15210074186325073, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.18905121088027954, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.1337066888809204, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.23699362576007843, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.2480958253145218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0037, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.09328999370336533, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.3416430950164795, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0048, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.13258710503578186, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0032, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.18493984639644623, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0037, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.10433483123779297, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0045, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.18333138525485992, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0038, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.25164106488227844, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0058, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.17989882826805115, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0041, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.1597793847322464, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.1543695032596588, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0036, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.2985675036907196, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0043, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.1357773244380951, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0036, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.23978300392627716, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.005, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.12806151807308197, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0035, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.2222731113433838, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0039, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.16744646430015564, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0035, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.2162114977836609, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0048, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.14857177436351776, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0036, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.21318115293979645, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0032, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.257682204246521, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0036, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.254349946975708, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0042, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.148925319314003, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0029, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.1902056336402893, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0031, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.17580094933509827, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0026, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.18856695294380188, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0045, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.17185454070568085, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0039, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.1997966468334198, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0043, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.14173944294452667, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0033, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.20653635263442993, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0039, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.19571708142757416, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0026, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.0877508670091629, + "learning_rate": 5.438496901657042e-06, + "loss": 0.005, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.17305001616477966, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0038, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.16555450856685638, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0035, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.15395715832710266, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2430422455072403, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0032, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.2465265393257141, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0034, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.08382703363895416, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0038, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3427184224128723, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0042, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.13029031455516815, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.11826448887586594, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0035, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.1612391620874405, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0039, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21143540740013123, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0057, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.22977286577224731, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.005, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.11853202432394028, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0058, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.24277184903621674, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0038, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.2625603675842285, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0048, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.1333419382572174, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0033, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.09627685695886612, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.416618674993515, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0038, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.18699553608894348, + "learning_rate": 5.294041118587667e-06, + "loss": 0.004, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.1827329397201538, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0039, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.19719162583351135, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0034, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.09895205497741699, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0042, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.11187861114740372, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0036, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.154103085398674, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.11124159395694733, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.27686378359794617, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0041, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.12900429964065552, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.26441213488578796, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0032, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.2187345325946808, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.004, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.08503159135580063, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0034, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.12869144976139069, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0035, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.13212713599205017, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0027, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.23211228847503662, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0032, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.2017366737127304, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.21221789717674255, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0039, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.24497511982917786, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0046, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.15008985996246338, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0039, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.15641193091869354, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0039, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.2608455419540405, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0036, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.09808705747127533, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0038, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.18084567785263062, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0045, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.1957635134458542, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0036, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.1479685753583908, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0045, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.14854201674461365, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0035, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.14744973182678223, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0033, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7196730375289917, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0071, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.22570419311523438, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0056, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.16870586574077606, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0042, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.12610554695129395, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0051, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.11198554188013077, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0042, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.13166265189647675, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0037, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.1181526631116867, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0037, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.2055635005235672, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0027, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.13400030136108398, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0028, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.09746947884559631, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0048, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.22124870121479034, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0048, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.09961193799972534, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0038, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.20024695992469788, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0036, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.3697144687175751, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0044, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.1713833063840866, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0038, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.1914745569229126, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0051, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.190393328666687, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0035, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.17361588776111603, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0038, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.19456325471401215, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0036, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.15772588551044464, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0039, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.11680205166339874, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0045, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.3643893599510193, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0044, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.1628265231847763, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0036, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.10073156654834747, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0041, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.13039462268352509, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0045, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.12775596976280212, + "learning_rate": 4.911226880894818e-06, + "loss": 0.003, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.1513100564479828, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0044, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.1346164345741272, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0036, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.12880294024944305, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0041, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.3154917359352112, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0038, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.18458192050457, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0057, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.2524041533470154, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0038, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.11894001811742783, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0033, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.1094699576497078, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0037, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.11090611666440964, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0045, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.3179106116294861, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0041, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.09424899518489838, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0034, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.3028348982334137, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0035, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.30831560492515564, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0045, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34811046719551086, + "learning_rate": 4.81141273556404e-06, + "loss": 0.005, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.18413113057613373, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0044, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.11229179799556732, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.004, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.2966957688331604, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0056, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.10525348782539368, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0037, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.1479673534631729, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0049, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5229315757751465, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0051, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.17021632194519043, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0038, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.10177282989025116, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0036, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.17768025398254395, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0034, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.2090948224067688, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0027, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.1722206026315689, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0031, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.09709088504314423, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0037, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.1969165802001953, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0034, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.0810595229268074, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0038, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.22003750503063202, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0041, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.2809178829193115, + "learning_rate": 4.699083753549858e-06, + "loss": 0.003, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.1343737691640854, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0026, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.19191010296344757, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0035, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.16617201268672943, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0031, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.24936997890472412, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0032, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5643696188926697, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0053, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.19725625216960907, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0031, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.1692969799041748, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0035, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.17487913370132446, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.25642889738082886, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0043, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3692823350429535, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0039, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.230118989944458, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0047, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.1609203815460205, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0038, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.21201254427433014, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0029, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.10142157226800919, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0034, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.19121089577674866, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0033, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.156619131565094, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0045, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.14690659940242767, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0042, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.13466109335422516, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0041, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.3713383674621582, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0056, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.12184764444828033, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0038, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.23971956968307495, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0035, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.3320925235748291, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0052, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.11913793534040451, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0029, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.11725693941116333, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0044, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.1550632119178772, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0049, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.23413509130477905, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0043, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.16070885956287384, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0032, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.12317437678575516, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.3462170660495758, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0048, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.12654773890972137, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0045, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.06262557208538055, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0026, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.1439850926399231, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0045, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.24463413655757904, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0042, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.22048236429691315, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 0.10628963261842728, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0037, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.14685721695423126, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0043, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.18807503581047058, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0042, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.19162075221538544, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0047, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.2444164752960205, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0041, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.12120077759027481, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0037, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.19946682453155518, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.003, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.23982395231723785, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0035, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.13806626200675964, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0031, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2610985040664673, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0047, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.1384919434785843, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0055, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.14737965166568756, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0037, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.1304326057434082, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0036, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.22288398444652557, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0035, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.11266916245222092, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0071, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.15941838920116425, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0024, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.18921831250190735, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0026, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.10112889111042023, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0037, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.1865631341934204, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0028, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.20046782493591309, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0042, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.11953336745500565, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0026, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.17050383985042572, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0029, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.28782936930656433, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0042, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.2104359269142151, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0034, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.12790441513061523, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0048, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.12111827731132507, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0043, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.2542783319950104, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0043, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.17177502810955048, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0036, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.14121277630329132, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0051, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.11357807368040085, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.3277477025985718, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.37000587582588196, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0037, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.11122190207242966, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0034, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.14530375599861145, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0039, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.19974422454833984, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0033, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.15466761589050293, + "learning_rate": 4.230335566422999e-06, + "loss": 0.003, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.19129224121570587, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0042, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.2474614828824997, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0046, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.15569351613521576, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0048, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.09572251886129379, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.13737086951732635, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0043, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.12266672402620316, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0039, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.09208404272794724, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0039, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.16571840643882751, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0047, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.3071173131465912, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.09059276431798935, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0031, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.16070133447647095, + "learning_rate": 4.160146936563338e-06, + "loss": 0.004, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.12942227721214294, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0027, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.13913804292678833, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0048, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.206321582198143, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.20973987877368927, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0041, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.23191478848457336, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0063, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.18233250081539154, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0029, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.133034810423851, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0043, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.10777711123228073, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0028, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.14128559827804565, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0034, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.13215866684913635, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0036, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.18918493390083313, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0043, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.14459657669067383, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0045, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.17287056148052216, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0039, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.13909804821014404, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0039, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.14798089861869812, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0037, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.10916659235954285, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0023, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.1151762530207634, + "learning_rate": 4.053587511509546e-06, + "loss": 0.005, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.14232765138149261, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0032, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.09513483196496964, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0043, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.09156285226345062, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0039, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.1405397206544876, + "learning_rate": 4.028855757736123e-06, + "loss": 0.004, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.15840958058834076, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0037, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.190508171916008, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0037, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.15277954936027527, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0028, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 0.14111991226673126, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0034, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.31528833508491516, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0044, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.1420607715845108, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.1340852528810501, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0032, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.11166475713253021, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0033, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.13635945320129395, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0028, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.15865778923034668, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0036, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.08569981157779694, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0026, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.1041082963347435, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0033, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.17262709140777588, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0041, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.20455610752105713, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0035, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.15869568288326263, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0027, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.14855770766735077, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0042, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.08842955529689789, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0028, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.18251122534275055, + "learning_rate": 3.919189353330104e-06, + "loss": 0.003, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.24990014731884003, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0038, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.1088186502456665, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0036, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.09780745953321457, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0042, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.1625395119190216, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0033, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.16848890483379364, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0033, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.19756828248500824, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0051, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.15720513463020325, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0033, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.22365699708461761, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0028, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.07928138971328735, + "learning_rate": 3.865363184624925e-06, + "loss": 0.003, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.26314112544059753, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0049, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.1249697357416153, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0036, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 0.09758924692869186, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0031, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.08506497740745544, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0037, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.1978219896554947, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0036, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 0.15215060114860535, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.1608658879995346, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0034, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.10854586958885193, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0026, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.1394745409488678, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0036, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.0879194363951683, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0034, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.11169253289699554, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0037, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.12410115450620651, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0033, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.13719962537288666, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0032, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.10031221807003021, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0046, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.1156797707080841, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0042, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.1494375318288803, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0036, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.08620154112577438, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0034, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.16659799218177795, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0053, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.1313968300819397, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0046, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.21495603024959564, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0037, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.11284582316875458, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0033, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.18478819727897644, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0038, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.12338980287313461, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0029, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.09782207757234573, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0037, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.10959567129611969, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0029, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.17048455774784088, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0035, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.12739142775535583, + "learning_rate": 3.707974016467e-06, + "loss": 0.0028, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.19227802753448486, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0045, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.11818226426839828, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0032, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.10820474475622177, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0026, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.11386270821094513, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0026, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.23488907516002655, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0035, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.12526266276836395, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0037, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.22899770736694336, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0035, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.13044586777687073, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0039, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.3652730882167816, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0041, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.1416187435388565, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.11176013946533203, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0037, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.09744516015052795, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0037, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.11925745010375977, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.0942603051662445, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0031, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.12849931418895721, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0042, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.11910247802734375, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0025, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.09603044390678406, + "learning_rate": 3.612069140022124e-06, + "loss": 0.004, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.1962766945362091, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0038, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.15775476396083832, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0043, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.1549777239561081, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0041, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.24444808065891266, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0029, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.12734061479568481, + "learning_rate": 3.584337233394337e-06, + "loss": 0.003, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.23149384558200836, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0042, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.1598765254020691, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0041, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.12173855304718018, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0046, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.09653043001890182, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0032, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.13262024521827698, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.003, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.2603001892566681, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0031, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.24721759557724, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0028, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.11963216960430145, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0047, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.12025906145572662, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0024, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.1969287395477295, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0033, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.24025285243988037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0047, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.07612641155719757, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0026, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.18313643336296082, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0038, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.3311282694339752, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0036, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.16643930971622467, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0034, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.11099164932966232, + "learning_rate": 3.497061149826966e-06, + "loss": 0.003, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.11017951369285583, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0031, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.17948199808597565, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0037, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.1002451479434967, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0042, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.13393986225128174, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0038, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.0963628888130188, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0027, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.14946860074996948, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0039, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.2011580467224121, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0045, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.12523533403873444, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0033, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.22948165237903595, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0031, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.24120132625102997, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0046, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.30398526787757874, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0043, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.13554388284683228, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0033, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.14989149570465088, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.003, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.15678660571575165, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0037, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.29919424653053284, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0042, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.08935242891311646, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.004, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.22928708791732788, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0042, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.18873436748981476, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0036, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.0956149622797966, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0037, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.13334470987319946, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0034, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.13492803275585175, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0033, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.13227517902851105, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.11342936754226685, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0042, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.3178110122680664, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0038, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.04432455077767372, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0028, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.09680923074483871, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0036, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.2477794885635376, + "learning_rate": 3.354907302553392e-06, + "loss": 0.004, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.11931425333023071, + "learning_rate": 3.349767211300933e-06, + "loss": 0.004, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.1410735696554184, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0032, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.16996408998966217, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0041, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.1275407373905182, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0038, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.10107860714197159, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0036, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.10196204483509064, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0032, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.10152500867843628, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.19691230356693268, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0045, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.33672890067100525, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0038, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.09857437759637833, + "learning_rate": 3.303911119253872e-06, + "loss": 0.004, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.13289818167686462, + "learning_rate": 3.298861077451818e-06, + "loss": 0.003, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.18509522080421448, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0064, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.11460676789283752, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0024, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.12012742459774017, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0039, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.356365442276001, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0035, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.5451288223266602, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0068, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.1067429855465889, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0034, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.2349347621202469, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0041, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.09102735668420792, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0032, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.11968998610973358, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0034, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.1355520486831665, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0034, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.11785157769918442, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0044, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.12043727189302444, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0047, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.13475126028060913, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0033, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.12776954472064972, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0032, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.10374128818511963, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0054, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.08750293403863907, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0035, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.1284732222557068, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0047, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.12900014221668243, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0042, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.11983122676610947, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0031, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.20311471819877625, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0045, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.1965232491493225, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0047, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.10592305660247803, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0031, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.10558371245861053, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0032, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.12083200365304947, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0025, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.2367735505104065, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0024, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.1387612670660019, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.004, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.18766231834888458, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0035, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.18110574781894684, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0031, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.1886875331401825, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.003, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.09323479980230331, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.004, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.1508265882730484, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0039, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.11250200122594833, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0039, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.23230062425136566, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.179047629237175, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.004, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.13797952234745026, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0042, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.12740616500377655, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0027, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.11396504938602448, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0037, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.12815812230110168, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0033, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.17100073397159576, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0039, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.09657446295022964, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0031, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.3235829472541809, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0049, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.17849496006965637, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0033, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.16907230019569397, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0028, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.26099368929862976, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.21024562418460846, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.10564325749874115, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0032, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.10607697814702988, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0037, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.20698976516723633, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.20934849977493286, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0033, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.12407243996858597, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0037, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.13003374636173248, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0032, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.15529648959636688, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0048, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.12824782729148865, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0031, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.12616124749183655, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0037, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.2119731307029724, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0037, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.22325192391872406, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0035, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.10937803238630295, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0037, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.3106321692466736, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2864716649055481, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0037, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.10637935250997543, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0037, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.11078158766031265, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0034, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.06270865350961685, + "learning_rate": 3.003637700546652e-06, + "loss": 0.003, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.12176132947206497, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0043, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.16978275775909424, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0032, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.2582871913909912, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0036, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.27402547001838684, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0031, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.15350353717803955, + "learning_rate": 2.981383959667165e-06, + "loss": 0.004, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.0939447432756424, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0033, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.16549192368984222, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0048, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.11002931743860245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0028, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.17383548617362976, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0032, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.18648599088191986, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0039, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.2366044819355011, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0036, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.1678195595741272, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0032, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.31918013095855713, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0045, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.14635732769966125, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0039, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.19166909158229828, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0028, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.11960610002279282, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0029, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.06636705994606018, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0025, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.17033624649047852, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0028, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.07974246889352798, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.003, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.1188567653298378, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0043, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.11378541588783264, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0028, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.11495907604694366, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0037, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.144247367978096, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0031, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.14722205698490143, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0046, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.10647077113389969, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0026, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.17438668012619019, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0032, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.17071637511253357, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0035, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.2201206386089325, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0041, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.14397655427455902, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0037, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.055822595953941345, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0035, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.13084810972213745, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0046, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.3321281373500824, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0038, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.1274777501821518, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0041, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.09797787666320801, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0034, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.1270579695701599, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0042, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.09015227854251862, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0034, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.12557077407836914, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0037, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.2725144922733307, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0036, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.13758502900600433, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0039, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.19999243319034576, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0038, + "step": 26000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.624037727047516e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..19d22af7b7d6155175015b5c3c5b452030d153ea --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-26000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf8e16588ffacf58cd09ed0241d355125d76c992d11c15a4bc8ee94db38dc3b +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9d8b74a2a35520e5c6d5bbf5ffdade6c4519906d --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0087d47a6c5b0d07f33a68c381c22caec5e0a0de6df974e91974ecad76589ef +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4057667e2599d2e61134a4e3ac5618ac9f96a50 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb092b3f0a637ad30c9f40563b57c8ca596c1d751aa5bff279208d9fe237b47e +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a277aea35bcd4c155a2cc2684db54f025c559a5b --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8a26015247ae60cbc1698de147272cbb0d73be19067b7d3c856d178168d9e9b +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..0da6184f9732635317d9591566929a0f088174db --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d37e4d3fa61809ce619eb85cd07bd04f11d64c35 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json @@ -0,0 +1,19634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6777518125711546, + "eval_steps": 500, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 2.688621997833252, + "learning_rate": 1.8e-07, + "loss": 0.1495, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.1722424030303955, + "learning_rate": 3.8e-07, + "loss": 0.1358, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 2.3095974922180176, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1268, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 2.131070852279663, + "learning_rate": 7.8e-07, + "loss": 0.1224, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 2.273555278778076, + "learning_rate": 9.800000000000001e-07, + "loss": 0.118, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 1.3571869134902954, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.111, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 1.6004165410995483, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 1.0413638353347778, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0657, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 1.1965473890304565, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0493, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 1.1422100067138672, + "learning_rate": 1.98e-06, + "loss": 0.0444, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 0.6911118626594543, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0457, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 0.6770259737968445, + "learning_rate": 2.38e-06, + "loss": 0.0257, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 0.4811704456806183, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0208, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 0.7260023951530457, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0203, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 0.4369716942310333, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0174, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 0.4100959300994873, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0133, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 1.0024627447128296, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0149, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.4598183035850525, + "learning_rate": 3.58e-06, + "loss": 0.0143, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 0.7042055130004883, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0143, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 0.7677909731864929, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0151, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 0.45090702176094055, + "learning_rate": 4.18e-06, + "loss": 0.0113, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 0.4400976598262787, + "learning_rate": 4.38e-06, + "loss": 0.0155, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 0.2424178272485733, + "learning_rate": 4.58e-06, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 0.4720967411994934, + "learning_rate": 4.78e-06, + "loss": 0.0166, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 0.41622042655944824, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0104, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 0.6915765404701233, + "learning_rate": 5.18e-06, + "loss": 0.0108, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.25931113958358765, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0104, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.42486071586608887, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.3798843324184418, + "learning_rate": 5.78e-06, + "loss": 0.0107, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.3281213343143463, + "learning_rate": 5.98e-06, + "loss": 0.0081, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 0.3394489884376526, + "learning_rate": 6.18e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 0.38298189640045166, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0098, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 0.3188078999519348, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.3152049779891968, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0087, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.34163472056388855, + "learning_rate": 6.98e-06, + "loss": 0.01, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 0.43860143423080444, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0065, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.2845093309879303, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0086, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 0.4009752869606018, + "learning_rate": 7.58e-06, + "loss": 0.0099, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.37756970524787903, + "learning_rate": 7.78e-06, + "loss": 0.0097, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.38135284185409546, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0076, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 0.3145769536495209, + "learning_rate": 8.18e-06, + "loss": 0.0106, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 0.32534345984458923, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0069, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.24024507403373718, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0089, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 0.32857799530029297, + "learning_rate": 8.78e-06, + "loss": 0.0105, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.28823110461235046, + "learning_rate": 8.98e-06, + "loss": 0.0101, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 0.32506972551345825, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0126, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 0.19875723123550415, + "learning_rate": 9.38e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.3245992958545685, + "learning_rate": 9.58e-06, + "loss": 0.0099, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.24933603405952454, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0117, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.3154098391532898, + "learning_rate": 9.980000000000001e-06, + "loss": 0.009, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.3685779273509979, + "learning_rate": 1.018e-05, + "loss": 0.0101, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 0.7251449823379517, + "learning_rate": 1.038e-05, + "loss": 0.0119, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 0.3183727264404297, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.009, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.3737810254096985, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0089, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.45293235778808594, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.011, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 0.3476772606372833, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.008, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.38373252749443054, + "learning_rate": 1.138e-05, + "loss": 0.0088, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 0.2530902624130249, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.008, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 0.19455896317958832, + "learning_rate": 1.178e-05, + "loss": 0.008, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.3315221071243286, + "learning_rate": 1.198e-05, + "loss": 0.0102, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.23430880904197693, + "learning_rate": 1.218e-05, + "loss": 0.007, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.4636307656764984, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0075, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.3785994052886963, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0109, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.2804955542087555, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0099, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.393702894449234, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0132, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.400641530752182, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0099, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 0.24428881704807281, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0076, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 0.4449252188205719, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0103, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.406582236289978, + "learning_rate": 1.378e-05, + "loss": 0.0098, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.36386170983314514, + "learning_rate": 1.398e-05, + "loss": 0.0088, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.38196030259132385, + "learning_rate": 1.418e-05, + "loss": 0.01, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.28740620613098145, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.3616485297679901, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0094, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.4004146158695221, + "learning_rate": 1.478e-05, + "loss": 0.009, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.4585514962673187, + "learning_rate": 1.498e-05, + "loss": 0.0092, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.20028235018253326, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0138, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 0.46603646874427795, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0139, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0116, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.22323082387447357, + "learning_rate": 1.578e-05, + "loss": 0.0097, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.26777058839797974, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0081, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.32380548119544983, + "learning_rate": 1.618e-05, + "loss": 0.0087, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.5248059630393982, + "learning_rate": 1.638e-05, + "loss": 0.0102, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.3495309054851532, + "learning_rate": 1.658e-05, + "loss": 0.0121, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.3551771342754364, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.5039486289024353, + "learning_rate": 1.698e-05, + "loss": 0.0094, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.3826751410961151, + "learning_rate": 1.718e-05, + "loss": 0.0107, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.46699973940849304, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0122, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3312668204307556, + "learning_rate": 1.758e-05, + "loss": 0.0087, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 0.28113219141960144, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.49752357602119446, + "learning_rate": 1.798e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.4177795350551605, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0096, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.34015583992004395, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.4612225890159607, + "learning_rate": 1.858e-05, + "loss": 0.0084, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.3813643753528595, + "learning_rate": 1.878e-05, + "loss": 0.012, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 0.27937838435173035, + "learning_rate": 1.898e-05, + "loss": 0.0104, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.4471273422241211, + "learning_rate": 1.918e-05, + "loss": 0.0125, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.4010440707206726, + "learning_rate": 1.938e-05, + "loss": 0.0106, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.41607654094696045, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0107, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 0.3589233458042145, + "learning_rate": 1.978e-05, + "loss": 0.0081, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.5726460814476013, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0111, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.36717164516448975, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0102, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.47284170985221863, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.01, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.5372244119644165, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.40928924083709717, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0088, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.4905182421207428, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0107, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.3709850609302521, + "learning_rate": 1.999981616897523e-05, + "loss": 0.01, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 0.6419615745544434, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0095, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.4986196458339691, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0127, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.5523516535758972, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0115, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0113, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 0.5146775245666504, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0101, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.2972394824028015, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0092, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.4030104875564575, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0097, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 0.4765481650829315, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0136, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.4051239788532257, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.3703782558441162, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5248176455497742, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0112, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.3100311756134033, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0083, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.45929211378097534, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0114, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 0.5695507526397705, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.5395359992980957, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0151, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.5106327533721924, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0124, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.3423260450363159, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0132, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.32126766443252563, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.011, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.5105165839195251, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0085, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 0.31927764415740967, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0088, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 0.4421865940093994, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0093, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.2930506765842438, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0091, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.2920694053173065, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0085, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.2661049962043762, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0081, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 0.3047257661819458, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0083, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.2774506211280823, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.2554785907268524, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0096, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.5792570114135742, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0108, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.3250623941421509, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0125, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 0.5885359048843384, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0117, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.37988749146461487, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.009, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.3751101493835449, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0099, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.31976667046546936, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0097, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 0.37007251381874084, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0079, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.4624205231666565, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0103, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 0.3769538700580597, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0094, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.25460657477378845, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0076, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.3976004719734192, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0109, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.2983521521091461, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.25581008195877075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0101, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.29260268807411194, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0102, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.3522181808948517, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0105, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.36269208788871765, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0103, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.40412119030952454, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0116, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.24089744687080383, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0119, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.4667617082595825, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0084, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.30139675736427307, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0101, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.38486286997795105, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0097, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.3526909649372101, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0071, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.3023934066295624, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0125, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.2796316146850586, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.25742489099502563, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0089, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.3626627027988434, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.01, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.3032572567462921, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0084, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.23514018952846527, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0086, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.3835832476615906, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0091, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.5170259475708008, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0146, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 0.8983817100524902, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0112, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.26260825991630554, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0086, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.481942743062973, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0126, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.311187207698822, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0064, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.3346790373325348, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.33836621046066284, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0085, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.3678463101387024, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0098, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.6136184334754944, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0154, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.39811593294143677, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0112, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6973778009414673, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0099, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.4773237109184265, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.3776084780693054, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.009, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 0.5061993598937988, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0097, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.41183987259864807, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.009, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 0.31513598561286926, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0112, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.4571514129638672, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0097, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.3183996379375458, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.2978666126728058, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0089, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.4791043698787689, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0087, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.5216032266616821, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0124, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.44693392515182495, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0092, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 0.41371819376945496, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.3593288064002991, + "learning_rate": 1.996106060741973e-05, + "loss": 0.014, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 0.4550306499004364, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0098, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.3510669469833374, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0066, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.2778814136981964, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.32210350036621094, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0067, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.42160800099372864, + "learning_rate": 1.995639934033493e-05, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.49051347374916077, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0102, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.3643694519996643, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.009, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.3717772960662842, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0076, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.32102280855178833, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0081, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.36725476384162903, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0102, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.39626258611679077, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0078, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.4183773696422577, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0105, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.3494930863380432, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0078, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.6155357956886292, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0119, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.34380587935447693, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0105, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.5476253032684326, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.01, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 0.37999996542930603, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0094, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 0.3124147057533264, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0125, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.4887244999408722, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.01, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5969874858856201, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0106, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 0.4295594096183777, + "learning_rate": 1.993971819309759e-05, + "loss": 0.007, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.3899303078651428, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0096, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.3912282884120941, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0075, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.5355616807937622, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.29141828417778015, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0129, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.24389855563640594, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.009, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.4070908725261688, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0085, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.26783379912376404, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0071, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.2644960880279541, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0089, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.35223162174224854, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0093, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.47337162494659424, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0095, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.25418519973754883, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0093, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 0.36384159326553345, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0082, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.30014440417289734, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0081, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.41121408343315125, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0081, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.5576186776161194, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.008, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.35785913467407227, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.3306240439414978, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0084, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 0.37215736508369446, + "learning_rate": 1.991774193879505e-05, + "loss": 0.012, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.5504099726676941, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0088, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.24932143092155457, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.007, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.5866615176200867, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.5174368619918823, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0121, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.2345893532037735, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0095, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.2683233916759491, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.2471713274717331, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.5090919733047485, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0108, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.2857886552810669, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0078, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.23729385435581207, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0096, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.30867621302604675, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0088, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.42522960901260376, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0103, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.37170591950416565, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0105, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.3672806918621063, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0121, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.4048611521720886, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.01, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.24768167734146118, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0125, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 0.5003495812416077, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0125, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.4303686022758484, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0084, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.3701602518558502, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0101, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.38272005319595337, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0075, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.2844183146953583, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0105, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.31114980578422546, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0095, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.3436568081378937, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.273001104593277, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0076, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.2653564512729645, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.3115384578704834, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0132, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.25932809710502625, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0083, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.28656521439552307, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0066, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.31808462738990784, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.0115, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.18877890706062317, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0092, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.3685394525527954, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0091, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.3878263533115387, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0082, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 0.284507691860199, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0085, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.3473755121231079, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0081, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.39935287833213806, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0081, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.34282153844833374, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0076, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.3581090271472931, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0087, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.37332627177238464, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0089, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 0.5224587321281433, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0089, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.42577075958251953, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0108, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.4602234959602356, + "learning_rate": 1.985504281027289e-05, + "loss": 0.014, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.4852961003780365, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0091, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.4437471628189087, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0112, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.37050408124923706, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0068, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.3345497250556946, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.36727628111839294, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0081, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 0.37056809663772583, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0152, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.5640603303909302, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0085, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 0.3653910160064697, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0078, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.2954258322715759, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0083, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6086210012435913, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0082, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 0.5260390043258667, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0105, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.3067379295825958, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0092, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.3480100929737091, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0088, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.26472753286361694, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0067, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.5254784226417542, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0146, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.35744136571884155, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0098, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.36186468601226807, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 0.35203835368156433, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0115, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.30590811371803284, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0108, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0082, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.2946765720844269, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0075, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.33707642555236816, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.007, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.2572688162326813, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0099, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.3901146352291107, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0185, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.4349755644798279, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0084, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.2383752018213272, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0092, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.46043846011161804, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.24630354344844818, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0062, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.5232640504837036, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 0.3850713074207306, + "learning_rate": 1.979809151602651e-05, + "loss": 0.014, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 0.44703760743141174, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0081, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.3762659728527069, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0099, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.4593638479709625, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0093, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.40554332733154297, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.33439910411834717, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0081, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.2623269855976105, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0062, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.22419600188732147, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0078, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.37183159589767456, + "learning_rate": 1.978133252131276e-05, + "loss": 0.01, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.27857136726379395, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.27683520317077637, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0069, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.45064759254455566, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0076, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.24215294420719147, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0071, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.5163891315460205, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0078, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.3922234773635864, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0077, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.19653558731079102, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0063, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.17621839046478271, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0084, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.6482162475585938, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.32759004831314087, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.33347561955451965, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0073, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.42883744835853577, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0084, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 0.3348788917064667, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0082, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.28349289298057556, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0102, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.2733197510242462, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0074, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.3263874351978302, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.01, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.295757532119751, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5598515868186951, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0093, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.425937294960022, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0083, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.2442379742860794, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0087, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.3378766179084778, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0163, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5137761831283569, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0099, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.3825916647911072, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0096, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.32084307074546814, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0066, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.3979593515396118, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0077, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.3103732764720917, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0067, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.5531997084617615, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0131, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5423216819763184, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0121, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.5038735270500183, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0087, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.44273868203163147, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.008, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.335232675075531, + "learning_rate": 1.971017390295979e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.4746256470680237, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.26807400584220886, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0075, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.35464033484458923, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0123, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.33803898096084595, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0094, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 0.20334473252296448, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.34386369585990906, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0081, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.38781842589378357, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0088, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.25994163751602173, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0079, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.3342406451702118, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0091, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.3120318353176117, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0079, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.3556351661682129, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0073, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.21421445906162262, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0095, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.39498451352119446, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0087, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0079, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.16734588146209717, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0072, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.3987548351287842, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.3929785490036011, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0096, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.2884303331375122, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0102, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.3338335454463959, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0092, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.47452738881111145, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0093, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.25584715604782104, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0068, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 0.3038389980792999, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0076, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.4123639464378357, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0101, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.38520675897598267, + "learning_rate": 1.964833301001045e-05, + "loss": 0.014, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.3355116844177246, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.3479195535182953, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0105, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.2700177729129791, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0076, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.963745667883003e-05, + "loss": 0.008, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 0.18578873574733734, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0071, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.26316413283348083, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0079, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.28762468695640564, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0115, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 0.3712877631187439, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.2862299382686615, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0072, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.2730867564678192, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0101, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.327648401260376, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.41153189539909363, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.32522135972976685, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0095, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.22764958441257477, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0085, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.3491888642311096, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.3123551607131958, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0103, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.1881783902645111, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0085, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.40902259945869446, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0089, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.382953941822052, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0088, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 0.23950865864753723, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0064, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.3419397175312042, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0118, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.42207059264183044, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0091, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.40754130482673645, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0087, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.2390766590833664, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0069, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.2974188029766083, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.0091, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.2993582785129547, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.42652204632759094, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.3138194680213928, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.009, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.38833311200141907, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0083, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.4015152156352997, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0081, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.42086881399154663, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.26732996106147766, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0071, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.5763937830924988, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0101, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.2955382764339447, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0075, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.4625638723373413, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0094, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.29631468653678894, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0096, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.46335819363594055, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0103, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.3183141350746155, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.008, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.26456212997436523, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0083, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 0.40924879908561707, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0097, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 0.3981763422489166, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0094, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.36437541246414185, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0064, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.2935962378978729, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0081, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.3478807210922241, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0079, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.3460087180137634, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0069, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.2706817090511322, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0088, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.2674945890903473, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0083, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.2268197238445282, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0072, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.3216208219528198, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0092, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.3226968050003052, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0101, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.2743329405784607, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0075, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.32573118805885315, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0094, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 0.53167325258255, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0099, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.3915646970272064, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0089, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.4526256322860718, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0101, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.323249489068985, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0094, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4046335816383362, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0088, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.34745559096336365, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0078, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.30308133363723755, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0071, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.37923407554626465, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0076, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 0.26785972714424133, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0093, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.2778306305408478, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0083, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.611038088798523, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0098, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.4114893078804016, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0111, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.2732110023498535, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0076, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.2964401841163635, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0095, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.40240928530693054, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0097, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.3901022672653198, + "learning_rate": 1.944152646499645e-05, + "loss": 0.008, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.38001132011413574, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0109, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.35937973856925964, + "learning_rate": 1.943474465322135e-05, + "loss": 0.007, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.2745327651500702, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0075, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.1598518043756485, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.007, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.401614785194397, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0115, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.4127846360206604, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0068, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.22147920727729797, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0061, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.28602245450019836, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0067, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.22147324681282043, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0076, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.2550548315048218, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0088, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.24113087356090546, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0076, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.3658410608768463, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0075, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.3856262266635895, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0112, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0075, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.3767516314983368, + "learning_rate": 1.938969919958475e-05, + "loss": 0.01, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.24380649626255035, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.009, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.30575039982795715, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0079, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.32913386821746826, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.009, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.29845312237739563, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0099, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.44377902150154114, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.34614384174346924, + "learning_rate": 1.936834723687526e-05, + "loss": 0.009, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.3316318690776825, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0096, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.4076138734817505, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 0.30320486426353455, + "learning_rate": 1.935753861926916e-05, + "loss": 0.015, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.32243025302886963, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.011, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.323745459318161, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0077, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5750753283500671, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0088, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.22709843516349792, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0101, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.3067542314529419, + "learning_rate": 1.933932815280178e-05, + "loss": 0.007, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.392337828874588, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0089, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.43343180418014526, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0073, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.4371345341205597, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0078, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.35214635729789734, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0077, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.3259161412715912, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.3849303722381592, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0066, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.3968902826309204, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.33016201853752136, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0095, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.3859156668186188, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.008, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.3020654618740082, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.007, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.44503262639045715, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0105, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.3908904194831848, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0073, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.39256253838539124, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0078, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.352611243724823, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0077, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.39203983545303345, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0081, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.23835115134716034, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0066, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.24996638298034668, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0098, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.29537609219551086, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.2898835837841034, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0077, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.4040369391441345, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.3501318395137787, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0094, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5462452173233032, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0097, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.4217568337917328, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0072, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.18295089900493622, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.3695569336414337, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.37818798422813416, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0089, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.29818472266197205, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0084, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.3328498303890228, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.01, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.340724378824234, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0075, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.2966301441192627, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0063, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.30677109956741333, + "learning_rate": 1.922098355206593e-05, + "loss": 0.008, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.2091839611530304, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0078, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.4229014217853546, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0115, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.40779992938041687, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0075, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.378817081451416, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.008, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.29796919226646423, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.2702767252922058, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0076, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.31349876523017883, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0085, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 0.30500444769859314, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0093, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.2860834002494812, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0061, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.26036593317985535, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0099, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.19049863517284393, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0075, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.3235284388065338, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.364092618227005, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.011, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.2409065216779709, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0092, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.36907926201820374, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.008, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.3230077922344208, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.191047802567482, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0063, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.3346494436264038, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0082, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.21352025866508484, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0075, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.5505086779594421, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.34264758229255676, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0083, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.20266413688659668, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0074, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.24938757717609406, + "learning_rate": 1.912718096497034e-05, + "loss": 0.007, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.4140026569366455, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0086, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.4424414038658142, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0104, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 0.5327904224395752, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 0.35958340764045715, + "learning_rate": 1.911035077753307e-05, + "loss": 0.01, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.2547682523727417, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0066, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.3701247274875641, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0115, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.34443217515945435, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0077, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.20353800058364868, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0061, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5660653114318848, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0091, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.26445311307907104, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0073, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.5561402440071106, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0071, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.3700469434261322, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.35783904790878296, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.008, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.3238641619682312, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0081, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.25247740745544434, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0099, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.435730904340744, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.008, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.37758126854896545, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0068, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.33323949575424194, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0094, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.4356318712234497, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 0.37893903255462646, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0058, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.4411139190196991, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0085, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0087, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4287096858024597, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0107, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.43085435032844543, + "learning_rate": 1.902392195640386e-05, + "loss": 0.009, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 0.2709400951862335, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0066, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.358126163482666, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0082, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.25320038199424744, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0077, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 0.31440937519073486, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0077, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.25246965885162354, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0079, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.28420332074165344, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0101, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.25251317024230957, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0075, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.19744229316711426, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0069, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4457854628562927, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0073, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.36817625164985657, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0096, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.3394709825515747, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0073, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.2909093201160431, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0065, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0072, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.25512900948524475, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0096, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0073, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.33459368348121643, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0096, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.21619321405887604, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0063, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.25518253445625305, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0067, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.2273867279291153, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.007, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.2864684462547302, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.3077942728996277, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0075, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 0.40526703000068665, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.41480064392089844, + "learning_rate": 1.891523933768891e-05, + "loss": 0.01, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.2750788629055023, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0064, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 0.29671600461006165, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0095, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 0.24160107970237732, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0069, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 0.2949109971523285, + "learning_rate": 1.889660337749874e-05, + "loss": 0.007, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.2847975492477417, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0059, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.30052465200424194, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0067, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.36128419637680054, + "learning_rate": 1.888252908366661e-05, + "loss": 0.014, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.36974236369132996, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0064, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.43730056285858154, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0084, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.3145422339439392, + "learning_rate": 1.88683715346172e-05, + "loss": 0.008, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.35473865270614624, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0091, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.2501350939273834, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.008, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.34808069467544556, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0099, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.45218509435653687, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0068, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.34530994296073914, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0098, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.38257333636283875, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0101, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.3040159344673157, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0079, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.3323517143726349, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0068, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.2639414370059967, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0078, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.3493870794773102, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.5838330984115601, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0091, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 0.428803026676178, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0087, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.3654572069644928, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0114, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.3295663297176361, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0075, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.3469060957431793, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0074, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.3366406261920929, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.32569241523742676, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0054, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3086700737476349, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0086, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.38562801480293274, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0092, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.3523421585559845, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0085, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.2278694063425064, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0063, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.32141822576522827, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0147, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.3375259041786194, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0077, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4483063220977783, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0062, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.3667140007019043, + "learning_rate": 1.874717450126662e-05, + "loss": 0.008, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.3419000506401062, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0079, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.36556369066238403, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0079, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.33135318756103516, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0064, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0091, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.34939518570899963, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0072, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.34424352645874023, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0077, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.3460613191127777, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.38822048902511597, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0066, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.35550639033317566, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0083, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0087, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.38202086091041565, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0081, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.25744789838790894, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0074, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.29700344800949097, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0082, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.305786669254303, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0076, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.3291271924972534, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.26111704111099243, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0074, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.348176509141922, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.27502793073654175, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0076, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.2831551432609558, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0092, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.39652079343795776, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0066, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.3885122239589691, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0087, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.37296077609062195, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.33606627583503723, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0086, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.3855937421321869, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0097, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.3322301506996155, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0076, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 0.33322253823280334, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.009, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.22358210384845734, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0088, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.5901851058006287, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0088, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.4703235328197479, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0084, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.20072896778583527, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0077, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.3537980616092682, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0098, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.3123277723789215, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0068, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.35979342460632324, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.38628828525543213, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0074, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.3498038053512573, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0074, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.20784054696559906, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0059, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.1811107099056244, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0085, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.43317103385925293, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0064, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.3815033435821533, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0064, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.35989734530448914, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.008, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.46118423342704773, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.012, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.25334376096725464, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0078, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.26764073967933655, + "learning_rate": 1.852547637090483e-05, + "loss": 0.01, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.2785920202732086, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0066, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.41587865352630615, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0061, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.42850133776664734, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.32369133830070496, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0091, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.2930110692977905, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0069, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.3199067711830139, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 0.4349478483200073, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0078, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 0.3054976165294647, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0061, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.2826739251613617, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0068, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.25106528401374817, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.007, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.25897887349128723, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0076, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.26398584246635437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.41751599311828613, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0083, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.17239610850811005, + "learning_rate": 1.844974808419918e-05, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3300461173057556, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0051, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.2645586133003235, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0068, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.24550332129001617, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0071, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.2889944911003113, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0091, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.476601779460907, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0066, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.35630306601524353, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0074, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.35651877522468567, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0084, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.3889803886413574, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0079, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4214278757572174, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.009, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.30540233850479126, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0083, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.3624532222747803, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0076, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.32963570952415466, + "learning_rate": 1.838347361898993e-05, + "loss": 0.01, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.3533381521701813, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0064, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.3011729419231415, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0065, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.4733760952949524, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0089, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.38553985953330994, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0059, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.2560643255710602, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0073, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.39531010389328003, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0106, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.2701983153820038, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0086, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.352717787027359, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0096, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.29157745838165283, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0073, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.4267994165420532, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0075, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.36308032274246216, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0075, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.33457428216934204, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0103, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.3717971444129944, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0069, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 0.21432936191558838, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0081, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.2878777086734772, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0057, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.4453850984573364, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0095, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.36917057633399963, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.3252313733100891, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0082, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.2529674470424652, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.2816419303417206, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0097, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.6464210152626038, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.33034399151802063, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0069, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.27335023880004883, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0078, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 0.3158395290374756, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.5128306746482849, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0087, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 0.24884961545467377, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0084, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.324278324842453, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0075, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6472476124763489, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0093, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.21269051730632782, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0066, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.29203882813453674, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0074, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.30436405539512634, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0087, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5066608190536499, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0081, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.32647472620010376, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0066, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.2804315388202667, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0066, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.24779941141605377, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0074, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.34001022577285767, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0101, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.2611280381679535, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0082, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.3129233717918396, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0079, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.2822776734828949, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0098, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.36969345808029175, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0064, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.33959338068962097, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0088, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.2628033459186554, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0062, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.38812723755836487, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0061, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.26403307914733887, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0055, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.28676870465278625, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0127, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.606293797492981, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.37321826815605164, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0063, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.368115097284317, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0091, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.3368416726589203, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0068, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.23466472327709198, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.006, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.3796599507331848, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0169, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.2202090471982956, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0099, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.5006175637245178, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0086, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.3673453629016876, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0083, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4379428029060364, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.006, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.43015891313552856, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0084, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.2806220054626465, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0061, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.23545289039611816, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0062, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 0.32115358114242554, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.3217777907848358, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.3224331736564636, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0072, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.31703537702560425, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0082, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 0.4175204932689667, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.008, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.22969186305999756, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0084, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.3421284258365631, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0077, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.32668444514274597, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0071, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.2729822099208832, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0068, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.33153197169303894, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0074, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.4678424000740051, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0076, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.23711496591567993, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0076, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.3230719566345215, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0084, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.32328692078590393, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0075, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.566879153251648, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0072, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.26277920603752136, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0062, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.339163601398468, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0082, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.23408609628677368, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0061, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.2942394018173218, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0065, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 0.3774799704551697, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0063, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.2847958207130432, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0072, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.2577030062675476, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0088, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.2883673906326294, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0075, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.3596307933330536, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0073, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.30285483598709106, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0076, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.2933914363384247, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0077, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.7666468024253845, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0102, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.31347739696502686, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0072, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.3435507118701935, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0081, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.3266170620918274, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0058, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.284027099609375, + "learning_rate": 1.784745142605655e-05, + "loss": 0.005, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.19972574710845947, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0072, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.2587524950504303, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0067, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.2922254204750061, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0064, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.17053507268428802, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0092, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.2850453555583954, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0073, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.2844892144203186, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0075, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.28969481587409973, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0079, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.4704195261001587, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0102, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.2652505338191986, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0077, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.2656702399253845, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0118, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.2282119244337082, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0086, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.30130353569984436, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0062, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.2295757234096527, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0066, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.25287938117980957, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0065, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.3274557292461395, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0076, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.34377023577690125, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0079, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.36259520053863525, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0055, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.24462608993053436, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0067, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.3615039587020874, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0088, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.40002626180648804, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0086, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.3362888991832733, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.33698126673698425, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0087, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.3287750482559204, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0068, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.23409898579120636, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0063, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.23275460302829742, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0066, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.35324692726135254, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0068, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.2781875729560852, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0066, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.3083304166793823, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0069, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.22543831169605255, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0066, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.22566530108451843, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0066, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.3640650808811188, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.35346123576164246, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0069, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0076, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.30895760655403137, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0074, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.30667638778686523, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0082, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.3134152889251709, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0086, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.21407048404216766, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.3456077575683594, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.4259016513824463, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.009, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.38690924644470215, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0094, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.31742537021636963, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0065, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.3568819463253021, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0077, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.3771888315677643, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.25528469681739807, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0067, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.36028411984443665, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0064, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.41987329721450806, + "learning_rate": 1.754802282200567e-05, + "loss": 0.007, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.18902993202209473, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0064, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.1859915405511856, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0086, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.1778331696987152, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0052, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4222147464752197, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.007, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.26806506514549255, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0074, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.34431734681129456, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0056, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.41732800006866455, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0079, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.3027847409248352, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0054, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.47592151165008545, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0066, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9539707899093628, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0095, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.4084669351577759, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0082, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.3052361309528351, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0072, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.23123528063297272, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.009, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.20356184244155884, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0073, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.048543930053711, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.3017459213733673, + "learning_rate": 1.74400239259128e-05, + "loss": 0.007, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.3679676353931427, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0085, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.20339734852313995, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0087, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.3523346781730652, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0076, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.4162348210811615, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0063, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.3293565511703491, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0067, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.24455691874027252, + "learning_rate": 1.739902378104222e-05, + "loss": 0.007, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 0.17645037174224854, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0051, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.2554231286048889, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0076, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.20006878674030304, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.27911216020584106, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.5701723694801331, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0081, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.222118079662323, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0072, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.2762138843536377, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0049, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 1.4110082387924194, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0114, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.31313180923461914, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0078, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.20941513776779175, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0079, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.3963930308818817, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0053, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.2066672146320343, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.3919369876384735, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0082, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.2544628083705902, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.0054, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.31123557686805725, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0078, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.24768301844596863, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0051, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.26674744486808777, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0052, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.27382466197013855, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.23384103178977966, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.0059, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.3531075417995453, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0068, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.34425088763237, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0066, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.2716144323348999, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.35163211822509766, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0071, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.23585639894008636, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0072, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.28066661953926086, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0068, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.3146689832210541, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0071, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.37553170323371887, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.008, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.18403242528438568, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0068, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.3904851973056793, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0072, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.4481397867202759, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0074, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.31124234199523926, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0074, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.3815377354621887, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0084, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.2909438908100128, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0074, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.3408021330833435, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.23902025818824768, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0076, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.2194853127002716, + "learning_rate": 1.714740708672306e-05, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 0.4337097108364105, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0092, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.4132380783557892, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0078, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.3434816598892212, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0076, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.25129666924476624, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0058, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.45458248257637024, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0064, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.5350340008735657, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.009, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 0.28008121252059937, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.33276447653770447, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0064, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37103456258773804, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0078, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 0.4689319133758545, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0073, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.3622629642486572, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.006, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0073, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.19226481020450592, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0059, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.27806007862091064, + "learning_rate": 1.704700993266678e-05, + "loss": 0.007, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.25948378443717957, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0076, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.5857216715812683, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.30467140674591064, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0073, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.2067701816558838, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0068, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 0.5653601288795471, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0087, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.3107249140739441, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0065, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4027363061904907, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0098, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.2757766544818878, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0091, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.30397671461105347, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0061, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.28112074732780457, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0063, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.14751966297626495, + "learning_rate": 1.696714953556411e-05, + "loss": 0.008, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.2988373935222626, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0055, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.2706286311149597, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0066, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.3612031042575836, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.25386789441108704, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0065, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.3170768916606903, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0056, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.4776926338672638, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0059, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.34828829765319824, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0088, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.20440815389156342, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.2943046987056732, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0068, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.16982606053352356, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0073, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.5607914924621582, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0085, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.35823172330856323, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0064, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.23943926393985748, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0068, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.24083787202835083, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0056, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.37987980246543884, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0062, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.35953620076179504, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0069, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.22255095839500427, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0071, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.4121200442314148, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0098, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.2377164363861084, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0076, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.2298472374677658, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0064, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.40824711322784424, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0066, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.33295100927352905, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.007, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.3978032171726227, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0077, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.27672451734542847, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.006, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.2591206729412079, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0089, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.1749347746372223, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0051, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.18699893355369568, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0056, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.240631103515625, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0089, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.3650512993335724, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0075, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.3503545820713043, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0067, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.3086877167224884, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.41695648431777954, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0064, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.33144691586494446, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0067, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.2679164409637451, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0072, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.22681233286857605, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0071, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.36362454295158386, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0067, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.20192845165729523, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0067, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.3895004093647003, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0055, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.22510671615600586, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0069, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.19641445577144623, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0101, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.2914806008338928, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0076, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.3187137544155121, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0059, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.3116552233695984, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0095, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.2597426772117615, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0058, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.21480600535869598, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.23912057280540466, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.006, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.317941278219223, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0064, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.58933025598526, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0095, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.21906700730323792, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0105, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.23899045586585999, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0059, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.2969389259815216, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0124, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.3514954447746277, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0066, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.18145518004894257, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0077, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.3087640404701233, + "learning_rate": 1.656303606359183e-05, + "loss": 0.006, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.3532063364982605, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0055, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.34000685811042786, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.24904295802116394, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0073, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.008, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.20241902768611908, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.009, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.3215351700782776, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0075, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0081, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.48170387744903564, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0071, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.3369109630584717, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0066, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.34541958570480347, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0058, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.2493886947631836, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0058, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.22845667600631714, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.2695702016353607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0055, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 0.28211796283721924, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0052, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.1901162564754486, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.2701025605201721, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0061, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.36527693271636963, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0072, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.3061700463294983, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0067, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.5612105131149292, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0087, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.23399880528450012, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0072, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.314933180809021, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0078, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.35548436641693115, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0094, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.37685567140579224, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0084, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.3190719783306122, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0065, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.26337119936943054, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0063, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.3518264889717102, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0072, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.3185817003250122, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.2995646893978119, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0064, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.3110463619232178, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0063, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.24277286231517792, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0064, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.17603862285614014, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0061, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.28089356422424316, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0076, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.2855492830276489, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0047, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.3247278928756714, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0058, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.18349547684192657, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0061, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.30654969811439514, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.007, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.2674420177936554, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0067, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.38177546858787537, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0091, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.33796218037605286, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0068, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.3754856586456299, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0063, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.21820858120918274, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.007, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.36184942722320557, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0061, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.32240399718284607, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0063, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 0.24755406379699707, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0059, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.397858589887619, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0064, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.389072448015213, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0063, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.3368140757083893, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0071, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0062, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.24265453219413757, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0076, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.19892603158950806, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0064, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.1852462887763977, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0051, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.1886446475982666, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0075, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.25982722640037537, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0068, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.3376137614250183, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0058, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.33173730969429016, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0064, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.3177517354488373, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0072, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.3385971784591675, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0066, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0073, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.2335229516029358, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0056, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.24502214789390564, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0054, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.2009458988904953, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.3341793715953827, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0082, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.3872147798538208, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0063, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.29940876364707947, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0073, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.4895729720592499, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0086, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.4485950469970703, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0053, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.22961653769016266, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0077, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.24187293648719788, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.005, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.3535212278366089, + "learning_rate": 1.601916647245149e-05, + "loss": 0.007, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.26539868116378784, + "learning_rate": 1.601107070706339e-05, + "loss": 0.008, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.43096065521240234, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0076, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.16919535398483276, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0058, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.2383720725774765, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0064, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.36103156208992004, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0067, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.2657287120819092, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0072, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.21437199413776398, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0065, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.34000417590141296, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0046, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.4855337142944336, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0068, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.3178497850894928, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0064, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.3171309530735016, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0067, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.3364340662956238, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0067, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2272711992263794, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0069, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.29505178332328796, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0078, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.3755042552947998, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0081, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.2983969449996948, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0085, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.3112468421459198, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0072, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.1950412392616272, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0061, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.2153436243534088, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.25062650442123413, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0079, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.1407836377620697, + "learning_rate": 1.584793312377278e-05, + "loss": 0.005, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.17276513576507568, + "learning_rate": 1.583971586792325e-05, + "loss": 0.006, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.47983887791633606, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0076, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.28724750876426697, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0076, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.3224884569644928, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0079, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.37969788908958435, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0063, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.48106926679611206, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0071, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.3555319905281067, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0075, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.19486083090305328, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.006, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.42018064856529236, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0074, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.3075830936431885, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0071, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.20921990275382996, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0063, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.20436584949493408, + "learning_rate": 1.574895332125391e-05, + "loss": 0.006, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0071, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.22980183362960815, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0078, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.24825431406497955, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.22042447328567505, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0071, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.249199777841568, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0076, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.32628607749938965, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0057, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.35151633620262146, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0059, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.29098865389823914, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0064, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.24006013572216034, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0058, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.2797141671180725, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0073, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.2963006794452667, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.19539053738117218, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0053, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.2686854898929596, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0051, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.35952430963516235, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0071, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.21042552590370178, + "learning_rate": 1.562410199183484e-05, + "loss": 0.005, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.27942436933517456, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0068, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.17137926816940308, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0063, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.20331411063671112, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0047, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.15683002769947052, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0052, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.14726290106773376, + "learning_rate": 1.558221191857467e-05, + "loss": 0.006, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.2940376400947571, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0068, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.4059796929359436, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0067, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.2587816119194031, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0086, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.3462979793548584, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0078, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.5607128739356995, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0079, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.24189788103103638, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0052, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 0.23362945020198822, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0073, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.22395116090774536, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0059, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.3514958322048187, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0064, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.25395795702934265, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0081, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.2948741018772125, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.22298739850521088, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.46948447823524475, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0097, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.2992243468761444, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0083, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.18001538515090942, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0055, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.23337051272392273, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.2863878905773163, + "learning_rate": 1.543878746906905e-05, + "loss": 0.006, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.23027309775352478, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0072, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.21359150111675262, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0064, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.3878735601902008, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0069, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.29146283864974976, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.007, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.21782676875591278, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0051, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.45582008361816406, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0063, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.4554077982902527, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0067, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.2254059612751007, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0064, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.13952374458312988, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0061, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.23241721093654633, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0072, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.3424162268638611, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0058, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.21074503660202026, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0057, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.33662086725234985, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0056, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.24403709173202515, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0073, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.27195101976394653, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0058, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.34224429726600647, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0072, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.29089581966400146, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0053, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3397226333618164, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0066, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.30517837405204773, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0092, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.3485032021999359, + "learning_rate": 1.52681291800283e-05, + "loss": 0.007, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.31346458196640015, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0045, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.1864607185125351, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.006, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.20976679027080536, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.22616958618164062, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0059, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.14772117137908936, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.33677151799201965, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0059, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.32354292273521423, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0061, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.21409569680690765, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0064, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.4659721851348877, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 0.32267874479293823, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.5019848942756653, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0061, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.32694318890571594, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0076, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.3013843297958374, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.1973707377910614, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0059, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.3365449607372284, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0059, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.3398677110671997, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.007, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.27888917922973633, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0062, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.2814931273460388, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0069, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.3317541182041168, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.006, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.21940776705741882, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0052, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.239700049161911, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0059, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.19117280840873718, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0071, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.21827168762683868, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0056, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.25645333528518677, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0085, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.30847233533859253, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0055, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.3127819895744324, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0058, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.30181658267974854, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0075, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.34778207540512085, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0077, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.18988046050071716, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0048, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.3479195833206177, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0045, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.24158424139022827, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0051, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.14698052406311035, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0053, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.4441753625869751, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0065, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.28078633546829224, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.29406028985977173, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0048, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3856968581676483, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0067, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.36528849601745605, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0062, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.34250667691230774, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0053, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.2862832844257355, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0055, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.3683549761772156, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0091, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.26892581582069397, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.2220073938369751, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0052, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.18825116753578186, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0065, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.28731998801231384, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0069, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.26817163825035095, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0058, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.44162800908088684, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0065, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 0.2990165948867798, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0074, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.20428279042243958, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0053, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.2918189465999603, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0056, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.30408942699432373, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0063, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.2593521177768707, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0061, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.34048640727996826, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0054, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.2438877820968628, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0059, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.5205245018005371, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0065, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.3658570349216461, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0061, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.23279106616973877, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0039, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.2704083323478699, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0054, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.1849551945924759, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0061, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.21807430684566498, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0059, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.47879981994628906, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0061, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.24125567078590393, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0056, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.25820469856262207, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0053, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.30664944648742676, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0075, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.3646678030490875, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0057, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.2534210979938507, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0045, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.2125798910856247, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 0.4387839734554291, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0072, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.337387353181839, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.01, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.23150259256362915, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.3243090808391571, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0076, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.26716119050979614, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.006, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.15551891922950745, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0061, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.1841796338558197, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0058, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 0.3119230270385742, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.2633327841758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0059, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.24567869305610657, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0055, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.3697315454483032, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0061, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.1941021829843521, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0052, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.2610131502151489, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.007, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.24856074154376984, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0062, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.27259066700935364, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0052, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.20962993800640106, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0055, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.4015270471572876, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.22935271263122559, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0063, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.29984018206596375, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0059, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.35775551199913025, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0079, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.15501125156879425, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0054, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.3543296158313751, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.1982075721025467, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.2616399824619293, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0062, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.2612541615962982, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0064, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3081730008125305, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0055, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.24024926126003265, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.20793405175209045, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.21445533633232117, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0058, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.24078251421451569, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0059, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.36214157938957214, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0061, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.2583295702934265, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0054, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.2641732394695282, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0069, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.2179708331823349, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0049, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.27418699860572815, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0049, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.3894921839237213, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0076, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.3912152945995331, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0063, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.16886518895626068, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0059, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.2731325626373291, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.3299262225627899, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.007, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.2671407163143158, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0058, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.2701479196548462, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0059, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.3803080916404724, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.2621704041957855, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0061, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.27780428528785706, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0065, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.3326016962528229, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.3632255792617798, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0069, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.24395202100276947, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0065, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.3215671181678772, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0066, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0065, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.31547197699546814, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0043, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.1893424689769745, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0059, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.27042335271835327, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0059, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.22597061097621918, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0063, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.1742873191833496, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0062, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.16797663271427155, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0048, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.42558521032333374, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0075, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.37216684222221375, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0061, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.19943472743034363, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.2211161106824875, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0075, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.2680184245109558, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0052, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.2402123361825943, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0051, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.1881084442138672, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0066, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.26134756207466125, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0063, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.3185539245605469, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0062, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.3118845820426941, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.22595946490764618, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.007, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.2627023458480835, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0067, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.2984865605831146, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0051, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0057, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.3078263998031616, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0074, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.17885653674602509, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0057, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.37737196683883667, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0058, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.21651378273963928, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0053, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.1974128633737564, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0059, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.17184904217720032, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.3074864447116852, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0059, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.28784239292144775, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0061, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.3435216546058655, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0065, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.38048845529556274, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0057, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.1875533014535904, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0052, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.48555630445480347, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0063, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 0.25066429376602173, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0055, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.2763892412185669, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0059, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.21217335760593414, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0092, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.23555652797222137, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.14828811585903168, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 0.27303484082221985, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0047, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.14681454002857208, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0067, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.43693456053733826, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.2940906286239624, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0059, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.20382657647132874, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0074, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0069, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.31879740953445435, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0062, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4898712933063507, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0051, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.17142456769943237, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0061, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.14010348916053772, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0045, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.26882827281951904, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0056, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.2636195421218872, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0048, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.24932081997394562, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0045, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.3367895185947418, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0049, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.15173649787902832, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0053, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.34083831310272217, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0072, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3327343165874481, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0048, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.36545902490615845, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0076, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.22761192917823792, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0067, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.19272181391716003, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0072, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.2881070375442505, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.006, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.32841676473617554, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0063, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.19850151240825653, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0052, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.31401291489601135, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.4023345112800598, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0058, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.25802844762802124, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0051, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.19678954780101776, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0053, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.4545653164386749, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0073, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.36174362897872925, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0068, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.31692951917648315, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0063, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.3470834195613861, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.29541268944740295, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0062, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.26377183198928833, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.006, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.2019137591123581, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0058, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.45156505703926086, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.007, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.15810425579547882, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.006, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.20093902945518494, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.28989917039871216, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0062, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.39454182982444763, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0063, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.25967612862586975, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0069, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.2058791220188141, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.26367849111557007, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0074, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.2432256042957306, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0054, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.19844679534435272, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.16757237911224365, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0052, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.2988821566104889, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0047, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.2231496274471283, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0048, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.265029639005661, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0048, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.41179928183555603, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0049, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.33498677611351013, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0052, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.2323407232761383, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0048, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.27306419610977173, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0061, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.2791977822780609, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0088, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.453421026468277, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.2572932839393616, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0056, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.19572272896766663, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0051, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.2831172049045563, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0057, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.21267575025558472, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0059, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.3220005929470062, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0057, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.2515857517719269, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0063, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.18344618380069733, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0052, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.34515154361724854, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0052, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.16711464524269104, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0054, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.3027217984199524, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.006, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.31168296933174133, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.007, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5778804421424866, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0056, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.2591782212257385, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.2449295073747635, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0046, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 0.19733767211437225, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0054, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.14837461709976196, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0053, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.3784295916557312, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0054, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.2400134950876236, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.17671307921409607, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0051, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.2664073705673218, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.006, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.25426605343818665, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0062, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.26733267307281494, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0049, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.46151378750801086, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.006, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.17070212960243225, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0062, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.42009514570236206, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0052, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.20439159870147705, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.25189417600631714, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0066, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.21402288973331451, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0072, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.294109046459198, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0061, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.29355865716934204, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0061, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.2937833368778229, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0061, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.1926010102033615, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0056, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.21794214844703674, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0065, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.23409108817577362, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0067, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.4696379005908966, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0062, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.28415724635124207, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0061, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.22433705627918243, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0064, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3090682923793793, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0056, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.23742817342281342, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0057, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.2670089900493622, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0052, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.2810697555541992, + "learning_rate": 1.299277443549658e-05, + "loss": 0.007, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.44233059883117676, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0069, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.14227768778800964, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0064, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.298776239156723, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0072, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.2882034480571747, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0064, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.23135380446910858, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0064, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.2870500981807709, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.005, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0064, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.2949783504009247, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0081, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.2215491235256195, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.26351356506347656, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0082, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.1909482628107071, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0052, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.13428187370300293, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0068, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.2125115543603897, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0048, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.27032148838043213, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0056, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.20981402695178986, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0069, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.24961373209953308, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0073, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.13643066585063934, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0054, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.25289252400398254, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.4061530828475952, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.006, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.29924723505973816, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0055, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.37029367685317993, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0053, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.37273409962654114, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0066, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.18242980539798737, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0054, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.18563945591449738, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.32972440123558044, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0045, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 0.3327874541282654, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0065, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.2077408730983734, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.1813255399465561, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0055, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.17811767756938934, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.20526157319545746, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0043, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.112189382314682, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0055, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.29082757234573364, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0099, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.23212411999702454, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0067, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.17449915409088135, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.3327349126338959, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0047, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.2709571123123169, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0056, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.19788618385791779, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0063, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.22075456380844116, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.2943982779979706, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0057, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.1718410849571228, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.3546068072319031, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.18132814764976501, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0047, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.17795684933662415, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0048, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.22964486479759216, + "learning_rate": 1.257232766480803e-05, + "loss": 0.005, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.3259448707103729, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0072, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.18410101532936096, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.28669047355651855, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0056, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.25986725091934204, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0055, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.1731722205877304, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0053, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.17501944303512573, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.005, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.2749968469142914, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0046, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.26125603914260864, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0055, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.22476239502429962, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0103, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.26169249415397644, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0067, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.19236186146736145, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0048, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.26535508036613464, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0055, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.2534106373786926, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.29464206099510193, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0076, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.3711875081062317, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0059, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0055, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.27274343371391296, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.006, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.15951389074325562, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0069, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.33735600113868713, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0064, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.19443227350711823, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0051, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.1960541307926178, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0049, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21133695542812347, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0066, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.22702853381633759, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.006, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.22489185631275177, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0061, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.33164891600608826, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0067, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.22196516394615173, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0055, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.19532594084739685, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0048, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.41902172565460205, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0064, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0052, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.2507944703102112, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0051, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.30817684531211853, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0052, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.27485454082489014, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.006, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.14287802577018738, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 0.14513961970806122, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.3345814645290375, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0051, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.2974685728549957, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0049, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.3455393612384796, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0062, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.16792115569114685, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.3038713335990906, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.005, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.2928559184074402, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.2317439168691635, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0039, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.3498123586177826, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0067, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.2850436866283417, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0045, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.18316122889518738, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0089, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.34362390637397766, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0066, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.13047993183135986, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0057, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.3403606116771698, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0055, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.27717292308807373, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0043, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.27412480115890503, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.1914675235748291, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0075, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.3778243958950043, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0084, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.20566068589687347, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.007, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.1868937760591507, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0051, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.24719548225402832, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.005, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0053, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4353996217250824, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.005, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.31571000814437866, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.005, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.14182177186012268, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0048, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.3461489975452423, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0062, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.17980965971946716, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0043, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.28671878576278687, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0048, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.18663623929023743, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0072, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.25223061442375183, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0063, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.20179906487464905, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.37325599789619446, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0079, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.18855971097946167, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.2992260754108429, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0051, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.18020357191562653, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0046, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.2106374204158783, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3749687373638153, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.1616801619529724, + "learning_rate": 1.188676298665799e-05, + "loss": 0.007, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.20882001519203186, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0143, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.16600479185581207, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0052, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.406480073928833, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0051, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.27349016070365906, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0056, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.2340608835220337, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0044, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.3165459632873535, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0042, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.19552721083164215, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0047, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.21882636845111847, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0061, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.23699741065502167, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.283207505941391, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0053, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.2782933712005615, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0062, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.3389151096343994, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0074, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.25642505288124084, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.19476772844791412, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0067, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.1992277055978775, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0057, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.21006375551223755, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0058, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.18808932602405548, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0073, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.258075475692749, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0052, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.29291409254074097, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.19002115726470947, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0041, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.4246057868003845, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 0.16166792809963226, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.005, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.35779255628585815, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.20405125617980957, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0082, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.23229332268238068, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0095, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.21156901121139526, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0074, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.22334401309490204, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0051, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.18344342708587646, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0048, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.22982414066791534, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0056, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.24991759657859802, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0046, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.27965986728668213, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.309841126203537, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0054, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.20964398980140686, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0044, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.45226722955703735, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0057, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.17177052795886993, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0064, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.8886939287185669, + "learning_rate": 1.153689339251154e-05, + "loss": 0.008, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.14726528525352478, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0066, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.32135209441185, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0064, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.22926779091358185, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.21345189213752747, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0047, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.31324461102485657, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0072, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.2185574620962143, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0047, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.36229151487350464, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0042, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.3479749262332916, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0053, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.23806153237819672, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.30633601546287537, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0079, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.2326052039861679, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0063, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 0.1756114363670349, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0064, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.18622055649757385, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0045, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.3261238932609558, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.16155003011226654, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0057, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.22661013901233673, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0046, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.24310468137264252, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0044, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.16182619333267212, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0056, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.1656215786933899, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0039, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.2945510447025299, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0049, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.24436083436012268, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.34221476316452026, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0069, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.26235878467559814, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0055, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.23333275318145752, + "learning_rate": 1.130316049722011e-05, + "loss": 0.005, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.23382601141929626, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0057, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 0.1693800389766693, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0058, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.3740929067134857, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.005, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.26146796345710754, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0038, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.13361674547195435, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0053, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8631370663642883, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0085, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.2952764630317688, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0054, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.23047442734241486, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0054, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0059, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.3246142864227295, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0066, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.31531205773353577, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0045, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4806351959705353, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0089, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.15645328164100647, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0051, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.29767802357673645, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0044, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.23338516056537628, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0055, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.20454354584217072, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0049, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.2087928056716919, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.004, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.18911990523338318, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0058, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.16931432485580444, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.3027138411998749, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.22635169327259064, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.26646292209625244, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0047, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.20067426562309265, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0054, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.22507227957248688, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0076, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.18533077836036682, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.005, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.1757635474205017, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0077, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.2326493263244629, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.006, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.2661048471927643, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0048, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.3285987079143524, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0047, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.3764145076274872, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.005, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.19637148082256317, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0048, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 0.16601431369781494, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.005, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.12405529618263245, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0036, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.21413138508796692, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.3323937952518463, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0057, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.20915299654006958, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0054, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.28372666239738464, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0048, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.32995301485061646, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0051, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.2148507684469223, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0061, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.22549118101596832, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.005, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.19749189913272858, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0049, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.250184565782547, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0065, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.23174546658992767, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.2707926034927368, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0049, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.175989031791687, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0058, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.2267833948135376, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0044, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.3495822846889496, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0048, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.2051204890012741, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0063, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.22149987518787384, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0058, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.21434035897254944, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0046, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.2996143400669098, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0065, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.22886960208415985, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0053, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.3317148685455322, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.005, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.45717868208885193, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0062, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.1223258301615715, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0051, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.2037084549665451, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0046, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.3772616982460022, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0045, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.30312252044677734, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0069, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.14988413453102112, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0047, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.3409348130226135, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.2308650016784668, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0049, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.15572187304496765, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0051, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.1962181180715561, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0049, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.337464302778244, + "learning_rate": 1.067930046280971e-05, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.17047251760959625, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0045, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.3098141849040985, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0043, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.17919068038463593, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0052, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.3461310863494873, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.006, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.37006744742393494, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0066, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.19726566970348358, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.005, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.1319705843925476, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0049, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.2131422460079193, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0055, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.1435563862323761, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0067, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.24024318158626556, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0055, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.1511068344116211, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0052, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.16795606911182404, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0047, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.1475641280412674, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0046, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.21277494728565216, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0048, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.2511015832424164, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.24675171077251434, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0059, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.2560728192329407, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0055, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.30879196524620056, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.005, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.1838868409395218, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0052, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.1673516035079956, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.20293423533439636, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0047, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.25513023138046265, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0052, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.26149800419807434, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.27551159262657166, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0041, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.2508440911769867, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0043, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.2889135181903839, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.1755184680223465, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0051, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.2095116674900055, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.33451047539711, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0079, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.44589516520500183, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0064, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.24158142507076263, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.15632936358451843, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.006, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.10808487981557846, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0065, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.1782998889684677, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0046, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.16395118832588196, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.004, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 0.30205732583999634, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0058, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.1561775654554367, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.004, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.1649634838104248, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0062, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.15428072214126587, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0043, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.11285894364118576, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0067, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.3470291793346405, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0056, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.16610246896743774, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0051, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.29931193590164185, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0051, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.15366005897521973, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.005, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.2352767139673233, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0057, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.19226962327957153, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0042, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.1903623789548874, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0044, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.4167932868003845, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0071, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.2913760840892792, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0046, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.2632276713848114, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0063, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.21258050203323364, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0043, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.19750680029392242, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0032, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.2896588444709778, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0045, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3017624020576477, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0074, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.18355949223041534, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0051, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.16483789682388306, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0056, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.2190672904253006, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.28435200452804565, + "learning_rate": 1.011517750003287e-05, + "loss": 0.005, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.2564929723739624, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0049, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.2592712342739105, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0048, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.18716935813426971, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0047, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.18236829340457916, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0049, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.27956655621528625, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0056, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.13664546608924866, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0048, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.21617569029331207, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0052, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.2196502536535263, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0054, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.20864732563495636, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0041, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.38381293416023254, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.005, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.1605401486158371, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0045, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.2079813927412033, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0051, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.2110205590724945, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0054, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.2421400547027588, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0048, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.41358140110969543, + "learning_rate": 9.969762660447491e-06, + "loss": 0.006, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.23386628925800323, + "learning_rate": 9.960077585586335e-06, + "loss": 0.005, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.20425592362880707, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0059, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.21164651215076447, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.1642364114522934, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0034, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.18716906011104584, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.15626995265483856, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0044, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.18394386768341064, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0044, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.3590037524700165, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0073, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.2103291153907776, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0051, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.19865299761295319, + "learning_rate": 9.87296819358355e-06, + "loss": 0.006, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.2052467316389084, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0065, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.31245940923690796, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0049, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.2959006726741791, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0042, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.33695659041404724, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0071, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.20898328721523285, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0062, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.3500119149684906, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0049, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.3926694095134735, + "learning_rate": 9.805290087509098e-06, + "loss": 0.007, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.24234539270401, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.1705496460199356, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0056, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.2907398045063019, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0048, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.2366454005241394, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0047, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.25498414039611816, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0046, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.163838192820549, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0048, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.1613040417432785, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0048, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.3639470338821411, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0042, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.22151169180870056, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0043, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.13474372029304504, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0051, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.2601003050804138, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0038, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.20202822983264923, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.18514803051948547, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0061, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.16678287088871002, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0038, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.17608965933322906, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0041, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.26356828212738037, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0059, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.297612726688385, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0047, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.16363881528377533, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.12642459571361542, + "learning_rate": 9.621949874438232e-06, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.3339644968509674, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0052, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.20784282684326172, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0046, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.28467273712158203, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0047, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.3124372661113739, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0051, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.3490087389945984, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.15114343166351318, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0051, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.41157594323158264, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0058, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.40405890345573425, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0045, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.1149911880493164, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0087, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.18746539950370789, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0058, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.1327875554561615, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0049, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.1530160903930664, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0038, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.2663615047931671, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0049, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.3390499949455261, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0046, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.2461002618074417, + "learning_rate": 9.477616135359713e-06, + "loss": 0.006, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.2141093611717224, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0049, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.20443470776081085, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0052, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.14927290380001068, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0039, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.3012462854385376, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0047, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.33484792709350586, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0045, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.19986321032047272, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0041, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.21612870693206787, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.19541047513484955, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0044, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.24203962087631226, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0049, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.1470087766647339, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0049, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.2336059808731079, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0048, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.32893121242523193, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0044, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.32034680247306824, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0055, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.27538758516311646, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0049, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.18869644403457642, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0065, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.2719379961490631, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0047, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.2850756347179413, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0043, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.19997543096542358, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0068, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.19222821295261383, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.16414248943328857, + "learning_rate": 9.285803018919292e-06, + "loss": 0.004, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.23754803836345673, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0039, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.2682085335254669, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0048, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.18268488347530365, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0046, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.14906349778175354, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0034, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.19079554080963135, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0041, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.09538780897855759, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.19193744659423828, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.1366361379623413, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.29436588287353516, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0052, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.24179348349571228, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0047, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.236627459526062, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0061, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.1719210296869278, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0054, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.2724406123161316, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.09852395206689835, + "learning_rate": 9.152007262148612e-06, + "loss": 0.004, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.23493632674217224, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.20697079598903656, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0047, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.16597376763820648, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0048, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.23542962968349457, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0046, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.18859006464481354, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0054, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.16773538291454315, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0044, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.2122378647327423, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0042, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.18205690383911133, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0046, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.1791398823261261, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0043, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.4446735680103302, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0052, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.32150915265083313, + "learning_rate": 9.047178679583151e-06, + "loss": 0.005, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.15855731070041656, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0045, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.19377414882183075, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0057, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.25969046354293823, + "learning_rate": 9.018636566864313e-06, + "loss": 0.006, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.2349981814622879, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.1853523701429367, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0051, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.22417226433753967, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.1969340741634369, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0058, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.18523764610290527, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.28188323974609375, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0052, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.18134717643260956, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0048, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.15660132467746735, + "learning_rate": 8.942627394858978e-06, + "loss": 0.004, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.3179869055747986, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0044, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.14007267355918884, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.31531354784965515, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0062, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.1867508888244629, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0054, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4172282814979553, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0056, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.21233956515789032, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0054, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.13055016100406647, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0048, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.24662990868091583, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0054, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.1877284198999405, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0045, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.20158089697360992, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0052, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.23169469833374023, + "learning_rate": 8.83836825410936e-06, + "loss": 0.0048, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.27991265058517456, + "learning_rate": 8.828905148874785e-06, + "loss": 0.008, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.3321090638637543, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.14790703356266022, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0033, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.1504756361246109, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0052, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.2211659848690033, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0038, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.1777208149433136, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0041, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.2586643397808075, + "learning_rate": 8.772180411864604e-06, + "loss": 0.006, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.2705499529838562, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.16527540981769562, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0037, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.24313445389270782, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0057, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.16705767810344696, + "learning_rate": 8.734416061983528e-06, + "loss": 0.004, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.20638783276081085, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0052, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.26159438490867615, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0039, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.30387070775032043, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0038, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.24292278289794922, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0042, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.3707493543624878, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0056, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.41142478585243225, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.22052627801895142, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0047, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.14626234769821167, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0047, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.25504666566848755, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0046, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.2020457535982132, + "learning_rate": 8.640192851412488e-06, + "loss": 0.006, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.2440478354692459, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0047, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.12040785700082779, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0044, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.25539812445640564, + "learning_rate": 8.611979388060327e-06, + "loss": 0.006, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.20701228082180023, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0041, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.24188214540481567, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0063, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.24987974762916565, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0063, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.20973123610019684, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0049, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.19898714125156403, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0061, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21703247725963593, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0056, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.18688541650772095, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0054, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.30194586515426636, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0049, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.17975366115570068, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0046, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.25966599583625793, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.1702205240726471, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.18940114974975586, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0052, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.18239127099514008, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0047, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.14571616053581238, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0046, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.17203395068645477, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0038, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.249881312251091, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0056, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.296194463968277, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0044, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.21376049518585205, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0052, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.2952374815940857, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.20862646400928497, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0051, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.17828255891799927, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0053, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.20771050453186035, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0038, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3046565651893616, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.12605167925357819, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0046, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.13702887296676636, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0038, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.11569058150053024, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0042, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.27488255500793457, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0054, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.30820342898368835, + "learning_rate": 8.349909816537207e-06, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.3108576536178589, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0056, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.16087505221366882, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0044, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.27139320969581604, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0055, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.17057007551193237, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0036, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.13946233689785004, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0057, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.2342602014541626, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0038, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.17249339818954468, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.2641673684120178, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0044, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.18304336071014404, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0041, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.25955966114997864, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0045, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.2159314751625061, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0038, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.254371702671051, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0043, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.10616741329431534, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0036, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.38598379492759705, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0065, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.3797863721847534, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0048, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.2059139758348465, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0062, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.19991335272789001, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0043, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.17376656830310822, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0047, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.17102457582950592, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0056, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.501983642578125, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0065, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.40338510274887085, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.10511627048254013, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0052, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.2610682249069214, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 0.09666074812412262, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0058, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.19014683365821838, + "learning_rate": 8.117972135268806e-06, + "loss": 0.005, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.2999255657196045, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.20351538062095642, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0049, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.1562410295009613, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0034, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.14160799980163574, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0035, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.10796743631362915, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0056, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.28861188888549805, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.3835368752479553, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.21850043535232544, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0038, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.2950346767902374, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0068, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.13051068782806396, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0041, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.11036359518766403, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0074, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.35306516289711, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0087, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.29782727360725403, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0045, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.20690713822841644, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0042, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.16064110398292542, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0038, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.2477649450302124, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0042, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.25939393043518066, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0045, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3345301151275635, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0045, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.19570066034793854, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0052, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.09655601531267166, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.13345655798912048, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0031, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.3130756616592407, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0072, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.16259168088436127, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0036, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.2581227123737335, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.36706119775772095, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.1705426573753357, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0069, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.4281153380870819, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0057, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.25743696093559265, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0036, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.17692404985427856, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.17617255449295044, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0043, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.193951815366745, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0042, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.2187023162841797, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0047, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.21488729119300842, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0039, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.13388743996620178, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0043, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.26977118849754333, + "learning_rate": 7.796848308199681e-06, + "loss": 0.004, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.40695786476135254, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0049, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.29070621728897095, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0056, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.2745647728443146, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0056, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.20881050825119019, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0057, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.17475518584251404, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0041, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.2414310723543167, + "learning_rate": 7.742248115573104e-06, + "loss": 0.004, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.20051640272140503, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.18383435904979706, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.16546988487243652, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0041, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.17165544629096985, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.25065234303474426, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0048, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.19762223958969116, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0038, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.23894545435905457, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.2860289216041565, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0053, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.3699626624584198, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0061, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.2370971292257309, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0043, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.19790691137313843, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0042, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.14648208022117615, + "learning_rate": 7.633462930388875e-06, + "loss": 0.005, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.105158232152462, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0032, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.24994254112243652, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0042, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.30648791790008545, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0058, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.16284243762493134, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0047, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.14919471740722656, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0045, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.14879491925239563, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0047, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.11741457879543304, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0041, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.09406878799200058, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0029, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.20860706269741058, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.24234607815742493, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0047, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.27025938034057617, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0042, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.15129081904888153, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0046, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.11173490434885025, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0035, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.2204807698726654, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0036, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.20111115276813507, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0087, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.213748961687088, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0045, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.21150177717208862, + "learning_rate": 7.480328799175369e-06, + "loss": 0.004, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.2450210005044937, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.16161729395389557, + "learning_rate": 7.4623904967312e-06, + "loss": 0.004, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.15077564120292664, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3078431487083435, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0051, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.15213221311569214, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.12404917925596237, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0042, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.18779516220092773, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0041, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.4039568603038788, + "learning_rate": 7.408675563767873e-06, + "loss": 0.005, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.2045651078224182, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0057, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.3885338306427002, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0049, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.253049373626709, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0059, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.250356525182724, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.3269367814064026, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0112, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.15401138365268707, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0052, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.1631775051355362, + "learning_rate": 7.346200065486093e-06, + "loss": 0.004, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.17112085223197937, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0038, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.24018551409244537, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0056, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.17964349687099457, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0057, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.1747465431690216, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0053, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.21299205720424652, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0038, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.13219258189201355, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0057, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 1.0558332204818726, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0066, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2154799997806549, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0041, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.13665339350700378, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.2101723700761795, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0039, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.13208501040935516, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0054, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.09342823177576065, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0032, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.22464905679225922, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0055, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.17030438780784607, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0042, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.17673689126968384, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0055, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.24041922390460968, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0048, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.14808662235736847, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.2489791214466095, + "learning_rate": 7.186522173441719e-06, + "loss": 0.004, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.19468742609024048, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0042, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.15028323233127594, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0061, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.13852037489414215, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.1401798278093338, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0063, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.1831122189760208, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0034, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.2867920994758606, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.13363438844680786, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0038, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.20085696876049042, + "learning_rate": 7.116016051769541e-06, + "loss": 0.004, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.1598372906446457, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0042, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.09672598540782928, + "learning_rate": 7.098434895408162e-06, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.18206225335597992, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0048, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.1818019449710846, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0038, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.21658800542354584, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.08513368666172028, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.10634194314479828, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0044, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.12106078863143921, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0037, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.11508465558290482, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0036, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.20805053412914276, + "learning_rate": 7.028294242074066e-06, + "loss": 0.004, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.23920200765132904, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0045, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.1300375908613205, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0045, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.23444809019565582, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0036, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.2636217772960663, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0044, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.31166398525238037, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.005, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.32881107926368713, + "learning_rate": 6.975884226362e-06, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.41748252511024475, + "learning_rate": 6.967165692827958e-06, + "loss": 0.006, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.1588834673166275, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0039, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.23697984218597412, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0039, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.19356773793697357, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0061, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.16373206675052643, + "learning_rate": 6.932338988482141e-06, + "loss": 0.004, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.1331700086593628, + "learning_rate": 6.923644220932124e-06, + "loss": 0.004, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4039696753025055, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0057, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.30325421690940857, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0065, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.21767468750476837, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0038, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.17474445700645447, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0056, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.17118008434772491, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.44261473417282104, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0063, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.18502798676490784, + "learning_rate": 6.862915366041247e-06, + "loss": 0.004, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.19384194910526276, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0036, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.1448352187871933, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0044, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3728172779083252, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0038, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.31421783566474915, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0043, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.28181371092796326, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0045, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.2249889373779297, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0041, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.26402008533477783, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0043, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.22621415555477142, + "learning_rate": 6.793802468038111e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.2681289315223694, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0045, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.17681041359901428, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0037, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.16526542603969574, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0032, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.30313149094581604, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0046, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.17628541588783264, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0065, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.1840096414089203, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.146232470870018, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0035, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.4804438352584839, + "learning_rate": 6.725005485342219e-06, + "loss": 0.005, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.2245558500289917, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0039, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.21845588088035583, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.1743943691253662, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0037, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.16978098452091217, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.27158796787261963, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0043, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.13516400754451752, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0048, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.1645064353942871, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0038, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.07616083323955536, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0046, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.13306911289691925, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.19445037841796875, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0044, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.18423207104206085, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0049, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.19280213117599487, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0043, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.25472623109817505, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0033, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.16799427568912506, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0031, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.2097395807504654, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.31450021266937256, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.16530238091945648, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0034, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.2506805956363678, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0038, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.1876160055398941, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0035, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.23704354465007782, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.13814999163150787, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0042, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.1164403185248375, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0042, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.23078426718711853, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0038, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.21749110519886017, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0046, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.24972137808799744, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.2491082102060318, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0043, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.14915086328983307, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0048, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.2794116735458374, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.13765662908554077, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0047, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.14874878525733948, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0042, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.1800280064344406, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0057, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.17518648505210876, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0049, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0045, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.3590790033340454, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0039, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.14534324407577515, + "learning_rate": 6.427861749601945e-06, + "loss": 0.004, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.1662825047969818, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.27466440200805664, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0045, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.1323469579219818, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0047, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.12367355078458786, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0077, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.18238325417041779, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0058, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.2733745574951172, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.3367181420326233, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0039, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.20671530067920685, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.23353071510791779, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0033, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.21081902086734772, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0031, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.3426077365875244, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0049, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.3905622959136963, + "learning_rate": 6.327475567095824e-06, + "loss": 0.004, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.1888400912284851, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0041, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.23982487618923187, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0041, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.2061331421136856, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0046, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.17000116407871246, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0033, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.15905790030956268, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0049, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.16794419288635254, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0052, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.3003343641757965, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0061, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.1429288536310196, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0042, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.18542084097862244, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0047, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.2692892253398895, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0035, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.23286236822605133, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0037, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.0963423103094101, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0041, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.1425798237323761, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0043, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.0960182398557663, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0046, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.2674477994441986, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 0.16276703774929047, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0041, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.24255621433258057, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.003, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.20395220816135406, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0054, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.12099681794643402, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0082, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.14017170667648315, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0042, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.28132137656211853, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0043, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.27220970392227173, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0039, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.23647353053092957, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0058, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.20623824000358582, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.12366114556789398, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0037, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.23330192267894745, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0056, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.19991633296012878, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0031, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.1496160626411438, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0058, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.13247868418693542, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0037, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.19072194397449493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0057, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.10773085057735443, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0042, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.14058449864387512, + "learning_rate": 6.063685039328116e-06, + "loss": 0.005, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.10825464874505997, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0042, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.18059906363487244, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.1713389754295349, + "learning_rate": 6.039253929027638e-06, + "loss": 0.005, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.23789434134960175, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0047, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.17626744508743286, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0041, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.2091904729604721, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0044, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.17293672263622284, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0043, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.13156521320343018, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0039, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.19591976702213287, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0043, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.16212835907936096, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0039, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.10661022365093231, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0037, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.16630858182907104, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.11001022905111313, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0037, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.1888381838798523, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0044, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.19239328801631927, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0044, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.16555139422416687, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0032, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.19748231768608093, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0043, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.1546473354101181, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0049, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.30511707067489624, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0037, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.1722872257232666, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0048, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.1784086525440216, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.15101182460784912, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0042, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.1252688318490982, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0041, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.15101821720600128, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0043, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.21302345395088196, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0035, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.1591431051492691, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0033, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.16010484099388123, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0049, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.19287234544754028, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0037, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.1804349720478058, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0036, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.14769446849822998, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0044, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.15914054214954376, + "learning_rate": 5.813791207086085e-06, + "loss": 0.004, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.19632315635681152, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0034, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3017818331718445, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0046, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.2728461027145386, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0044, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.18619874119758606, + "learning_rate": 5.781966956563247e-06, + "loss": 0.004, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.1235085129737854, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0037, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.15798084437847137, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0035, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.15713484585285187, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0036, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.15594886243343353, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0038, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.1558992713689804, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.20599815249443054, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0054, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.2785670757293701, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0042, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.22550497949123383, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.15210074186325073, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.18905121088027954, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.1337066888809204, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.23699362576007843, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.2480958253145218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0037, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.09328999370336533, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.3416430950164795, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0048, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.13258710503578186, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0032, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.18493984639644623, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0037, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.10433483123779297, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0045, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.18333138525485992, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0038, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.25164106488227844, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0058, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.17989882826805115, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0041, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.1597793847322464, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.1543695032596588, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0036, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.2985675036907196, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0043, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.1357773244380951, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0036, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.23978300392627716, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.005, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.12806151807308197, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0035, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.2222731113433838, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0039, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.16744646430015564, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0035, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.2162114977836609, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0048, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.14857177436351776, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0036, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.21318115293979645, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0032, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.257682204246521, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0036, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.254349946975708, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0042, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.148925319314003, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0029, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.1902056336402893, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0031, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.17580094933509827, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0026, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.18856695294380188, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0045, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.17185454070568085, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0039, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.1997966468334198, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0043, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.14173944294452667, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0033, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.20653635263442993, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0039, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.19571708142757416, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0026, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.0877508670091629, + "learning_rate": 5.438496901657042e-06, + "loss": 0.005, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.17305001616477966, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0038, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.16555450856685638, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0035, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.15395715832710266, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2430422455072403, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0032, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.2465265393257141, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0034, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.08382703363895416, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0038, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3427184224128723, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0042, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.13029031455516815, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.11826448887586594, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0035, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.1612391620874405, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0039, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21143540740013123, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0057, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.22977286577224731, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.005, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.11853202432394028, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0058, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.24277184903621674, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0038, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.2625603675842285, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0048, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.1333419382572174, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0033, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.09627685695886612, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.416618674993515, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0038, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.18699553608894348, + "learning_rate": 5.294041118587667e-06, + "loss": 0.004, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.1827329397201538, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0039, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.19719162583351135, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0034, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.09895205497741699, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0042, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.11187861114740372, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0036, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.154103085398674, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.11124159395694733, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.27686378359794617, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0041, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.12900429964065552, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.26441213488578796, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0032, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.2187345325946808, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.004, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.08503159135580063, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0034, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.12869144976139069, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0035, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.13212713599205017, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0027, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.23211228847503662, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0032, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.2017366737127304, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.21221789717674255, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0039, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.24497511982917786, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0046, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.15008985996246338, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0039, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.15641193091869354, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0039, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.2608455419540405, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0036, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.09808705747127533, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0038, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.18084567785263062, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0045, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.1957635134458542, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0036, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.1479685753583908, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0045, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.14854201674461365, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0035, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.14744973182678223, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0033, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7196730375289917, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0071, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.22570419311523438, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0056, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.16870586574077606, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0042, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.12610554695129395, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0051, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.11198554188013077, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0042, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.13166265189647675, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0037, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.1181526631116867, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0037, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.2055635005235672, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0027, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.13400030136108398, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0028, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.09746947884559631, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0048, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.22124870121479034, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0048, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.09961193799972534, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0038, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.20024695992469788, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0036, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.3697144687175751, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0044, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.1713833063840866, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0038, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.1914745569229126, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0051, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.190393328666687, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0035, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.17361588776111603, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0038, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.19456325471401215, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0036, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.15772588551044464, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0039, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.11680205166339874, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0045, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.3643893599510193, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0044, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.1628265231847763, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0036, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.10073156654834747, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0041, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.13039462268352509, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0045, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.12775596976280212, + "learning_rate": 4.911226880894818e-06, + "loss": 0.003, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.1513100564479828, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0044, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.1346164345741272, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0036, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.12880294024944305, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0041, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.3154917359352112, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0038, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.18458192050457, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0057, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.2524041533470154, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0038, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.11894001811742783, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0033, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.1094699576497078, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0037, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.11090611666440964, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0045, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.3179106116294861, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0041, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.09424899518489838, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0034, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.3028348982334137, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0035, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.30831560492515564, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0045, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34811046719551086, + "learning_rate": 4.81141273556404e-06, + "loss": 0.005, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.18413113057613373, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0044, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.11229179799556732, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.004, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.2966957688331604, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0056, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.10525348782539368, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0037, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.1479673534631729, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0049, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5229315757751465, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0051, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.17021632194519043, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0038, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.10177282989025116, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0036, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.17768025398254395, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0034, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.2090948224067688, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0027, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.1722206026315689, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0031, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.09709088504314423, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0037, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.1969165802001953, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0034, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.0810595229268074, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0038, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.22003750503063202, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0041, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.2809178829193115, + "learning_rate": 4.699083753549858e-06, + "loss": 0.003, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.1343737691640854, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0026, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.19191010296344757, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0035, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.16617201268672943, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0031, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.24936997890472412, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0032, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5643696188926697, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0053, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.19725625216960907, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0031, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.1692969799041748, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0035, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.17487913370132446, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.25642889738082886, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0043, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3692823350429535, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0039, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.230118989944458, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0047, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.1609203815460205, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0038, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.21201254427433014, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0029, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.10142157226800919, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0034, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.19121089577674866, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0033, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.156619131565094, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0045, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.14690659940242767, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0042, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.13466109335422516, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0041, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.3713383674621582, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0056, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.12184764444828033, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0038, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.23971956968307495, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0035, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.3320925235748291, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0052, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.11913793534040451, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0029, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.11725693941116333, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0044, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.1550632119178772, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0049, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.23413509130477905, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0043, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.16070885956287384, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0032, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.12317437678575516, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.3462170660495758, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0048, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.12654773890972137, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0045, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.06262557208538055, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0026, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.1439850926399231, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0045, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.24463413655757904, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0042, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.22048236429691315, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 0.10628963261842728, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0037, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.14685721695423126, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0043, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.18807503581047058, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0042, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.19162075221538544, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0047, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.2444164752960205, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0041, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.12120077759027481, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0037, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.19946682453155518, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.003, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.23982395231723785, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0035, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.13806626200675964, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0031, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2610985040664673, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0047, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.1384919434785843, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0055, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.14737965166568756, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0037, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.1304326057434082, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0036, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.22288398444652557, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0035, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.11266916245222092, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0071, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.15941838920116425, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0024, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.18921831250190735, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0026, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.10112889111042023, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0037, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.1865631341934204, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0028, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.20046782493591309, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0042, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.11953336745500565, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0026, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.17050383985042572, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0029, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.28782936930656433, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0042, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.2104359269142151, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0034, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.12790441513061523, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0048, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.12111827731132507, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0043, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.2542783319950104, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0043, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.17177502810955048, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0036, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.14121277630329132, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0051, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.11357807368040085, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.3277477025985718, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.37000587582588196, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0037, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.11122190207242966, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0034, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.14530375599861145, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0039, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.19974422454833984, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0033, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.15466761589050293, + "learning_rate": 4.230335566422999e-06, + "loss": 0.003, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.19129224121570587, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0042, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.2474614828824997, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0046, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.15569351613521576, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0048, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.09572251886129379, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.13737086951732635, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0043, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.12266672402620316, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0039, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.09208404272794724, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0039, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.16571840643882751, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0047, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.3071173131465912, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.09059276431798935, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0031, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.16070133447647095, + "learning_rate": 4.160146936563338e-06, + "loss": 0.004, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.12942227721214294, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0027, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.13913804292678833, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0048, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.206321582198143, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.20973987877368927, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0041, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.23191478848457336, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0063, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.18233250081539154, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0029, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.133034810423851, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0043, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.10777711123228073, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0028, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.14128559827804565, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0034, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.13215866684913635, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0036, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.18918493390083313, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0043, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.14459657669067383, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0045, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.17287056148052216, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0039, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.13909804821014404, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0039, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.14798089861869812, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0037, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.10916659235954285, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0023, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.1151762530207634, + "learning_rate": 4.053587511509546e-06, + "loss": 0.005, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.14232765138149261, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0032, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.09513483196496964, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0043, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.09156285226345062, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0039, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.1405397206544876, + "learning_rate": 4.028855757736123e-06, + "loss": 0.004, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.15840958058834076, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0037, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.190508171916008, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0037, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.15277954936027527, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0028, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 0.14111991226673126, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0034, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.31528833508491516, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0044, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.1420607715845108, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.1340852528810501, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0032, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.11166475713253021, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0033, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.13635945320129395, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0028, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.15865778923034668, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0036, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.08569981157779694, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0026, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.1041082963347435, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0033, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.17262709140777588, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0041, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.20455610752105713, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0035, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.15869568288326263, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0027, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.14855770766735077, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0042, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.08842955529689789, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0028, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.18251122534275055, + "learning_rate": 3.919189353330104e-06, + "loss": 0.003, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.24990014731884003, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0038, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.1088186502456665, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0036, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.09780745953321457, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0042, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.1625395119190216, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0033, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.16848890483379364, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0033, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.19756828248500824, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0051, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.15720513463020325, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0033, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.22365699708461761, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0028, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.07928138971328735, + "learning_rate": 3.865363184624925e-06, + "loss": 0.003, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.26314112544059753, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0049, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.1249697357416153, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0036, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 0.09758924692869186, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0031, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.08506497740745544, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0037, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.1978219896554947, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0036, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 0.15215060114860535, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.1608658879995346, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0034, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.10854586958885193, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0026, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.1394745409488678, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0036, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.0879194363951683, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0034, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.11169253289699554, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0037, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.12410115450620651, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0033, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.13719962537288666, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0032, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.10031221807003021, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0046, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.1156797707080841, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0042, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.1494375318288803, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0036, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.08620154112577438, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0034, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.16659799218177795, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0053, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.1313968300819397, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0046, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.21495603024959564, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0037, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.11284582316875458, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0033, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.18478819727897644, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0038, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.12338980287313461, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0029, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.09782207757234573, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0037, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.10959567129611969, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0029, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.17048455774784088, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0035, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.12739142775535583, + "learning_rate": 3.707974016467e-06, + "loss": 0.0028, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.19227802753448486, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0045, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.11818226426839828, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0032, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.10820474475622177, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0026, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.11386270821094513, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0026, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.23488907516002655, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0035, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.12526266276836395, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0037, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.22899770736694336, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0035, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.13044586777687073, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0039, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.3652730882167816, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0041, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.1416187435388565, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.11176013946533203, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0037, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.09744516015052795, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0037, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.11925745010375977, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.0942603051662445, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0031, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.12849931418895721, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0042, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.11910247802734375, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0025, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.09603044390678406, + "learning_rate": 3.612069140022124e-06, + "loss": 0.004, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.1962766945362091, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0038, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.15775476396083832, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0043, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.1549777239561081, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0041, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.24444808065891266, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0029, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.12734061479568481, + "learning_rate": 3.584337233394337e-06, + "loss": 0.003, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.23149384558200836, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0042, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.1598765254020691, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0041, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.12173855304718018, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0046, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.09653043001890182, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0032, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.13262024521827698, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.003, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.2603001892566681, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0031, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.24721759557724, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0028, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.11963216960430145, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0047, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.12025906145572662, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0024, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.1969287395477295, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0033, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.24025285243988037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0047, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.07612641155719757, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0026, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.18313643336296082, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0038, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.3311282694339752, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0036, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.16643930971622467, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0034, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.11099164932966232, + "learning_rate": 3.497061149826966e-06, + "loss": 0.003, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.11017951369285583, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0031, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.17948199808597565, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0037, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.1002451479434967, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0042, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.13393986225128174, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0038, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.0963628888130188, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0027, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.14946860074996948, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0039, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.2011580467224121, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0045, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.12523533403873444, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0033, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.22948165237903595, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0031, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.24120132625102997, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0046, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.30398526787757874, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0043, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.13554388284683228, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0033, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.14989149570465088, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.003, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.15678660571575165, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0037, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.29919424653053284, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0042, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.08935242891311646, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.004, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.22928708791732788, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0042, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.18873436748981476, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0036, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.0956149622797966, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0037, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.13334470987319946, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0034, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.13492803275585175, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0033, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.13227517902851105, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.11342936754226685, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0042, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.3178110122680664, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0038, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.04432455077767372, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0028, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.09680923074483871, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0036, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.2477794885635376, + "learning_rate": 3.354907302553392e-06, + "loss": 0.004, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.11931425333023071, + "learning_rate": 3.349767211300933e-06, + "loss": 0.004, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.1410735696554184, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0032, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.16996408998966217, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0041, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.1275407373905182, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0038, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.10107860714197159, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0036, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.10196204483509064, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0032, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.10152500867843628, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.19691230356693268, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0045, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.33672890067100525, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0038, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.09857437759637833, + "learning_rate": 3.303911119253872e-06, + "loss": 0.004, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.13289818167686462, + "learning_rate": 3.298861077451818e-06, + "loss": 0.003, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.18509522080421448, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0064, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.11460676789283752, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0024, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.12012742459774017, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0039, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.356365442276001, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0035, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.5451288223266602, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0068, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.1067429855465889, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0034, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.2349347621202469, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0041, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.09102735668420792, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0032, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.11968998610973358, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0034, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.1355520486831665, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0034, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.11785157769918442, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0044, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.12043727189302444, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0047, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.13475126028060913, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0033, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.12776954472064972, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0032, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.10374128818511963, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0054, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.08750293403863907, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0035, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.1284732222557068, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0047, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.12900014221668243, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0042, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.11983122676610947, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0031, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.20311471819877625, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0045, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.1965232491493225, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0047, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.10592305660247803, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0031, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.10558371245861053, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0032, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.12083200365304947, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0025, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.2367735505104065, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0024, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.1387612670660019, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.004, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.18766231834888458, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0035, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.18110574781894684, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0031, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.1886875331401825, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.003, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.09323479980230331, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.004, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.1508265882730484, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0039, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.11250200122594833, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0039, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.23230062425136566, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.179047629237175, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.004, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.13797952234745026, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0042, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.12740616500377655, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0027, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.11396504938602448, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0037, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.12815812230110168, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0033, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.17100073397159576, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0039, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.09657446295022964, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0031, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.3235829472541809, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0049, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.17849496006965637, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0033, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.16907230019569397, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0028, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.26099368929862976, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.21024562418460846, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.10564325749874115, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0032, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.10607697814702988, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0037, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.20698976516723633, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.20934849977493286, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0033, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.12407243996858597, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0037, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.13003374636173248, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0032, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.15529648959636688, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0048, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.12824782729148865, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0031, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.12616124749183655, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0037, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.2119731307029724, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0037, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.22325192391872406, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0035, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.10937803238630295, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0037, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.3106321692466736, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2864716649055481, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0037, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.10637935250997543, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0037, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.11078158766031265, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0034, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.06270865350961685, + "learning_rate": 3.003637700546652e-06, + "loss": 0.003, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.12176132947206497, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0043, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.16978275775909424, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0032, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.2582871913909912, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0036, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.27402547001838684, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0031, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.15350353717803955, + "learning_rate": 2.981383959667165e-06, + "loss": 0.004, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.0939447432756424, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0033, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.16549192368984222, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0048, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.11002931743860245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0028, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.17383548617362976, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0032, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.18648599088191986, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0039, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.2366044819355011, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0036, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.1678195595741272, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0032, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.31918013095855713, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0045, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.14635732769966125, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0039, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.19166909158229828, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0028, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.11960610002279282, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0029, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.06636705994606018, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0025, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.17033624649047852, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0028, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.07974246889352798, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.003, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.1188567653298378, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0043, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.11378541588783264, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0028, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.11495907604694366, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0037, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.144247367978096, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0031, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.14722205698490143, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0046, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.10647077113389969, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0026, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.17438668012619019, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0032, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.17071637511253357, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0035, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.2201206386089325, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0041, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.14397655427455902, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0037, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.055822595953941345, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0035, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.13084810972213745, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0046, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.3321281373500824, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0038, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.1274777501821518, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0041, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.09797787666320801, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0034, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.1270579695701599, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0042, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.09015227854251862, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0034, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.12557077407836914, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0037, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.2725144922733307, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0036, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.13758502900600433, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0039, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.19999243319034576, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0038, + "step": 26000 + }, + { + "epoch": 1.5585115944634191, + "grad_norm": 0.1323961615562439, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0033, + "step": 26010 + }, + { + "epoch": 1.5591107915393372, + "grad_norm": 0.12714031338691711, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.0033, + "step": 26020 + }, + { + "epoch": 1.5597099886152557, + "grad_norm": 0.40822476148605347, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0041, + "step": 26030 + }, + { + "epoch": 1.5603091856911737, + "grad_norm": 0.14638100564479828, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0042, + "step": 26040 + }, + { + "epoch": 1.5609083827670922, + "grad_norm": 0.17443427443504333, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.0031, + "step": 26050 + }, + { + "epoch": 1.5615075798430103, + "grad_norm": 0.09581520408391953, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0036, + "step": 26060 + }, + { + "epoch": 1.5621067769189287, + "grad_norm": 0.14804130792617798, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.004, + "step": 26070 + }, + { + "epoch": 1.5627059739948468, + "grad_norm": 0.4015085697174072, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0062, + "step": 26080 + }, + { + "epoch": 1.5633051710707653, + "grad_norm": 0.3468920886516571, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.004, + "step": 26090 + }, + { + "epoch": 1.5639043681466833, + "grad_norm": 0.19594644010066986, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0042, + "step": 26100 + }, + { + "epoch": 1.5645035652226018, + "grad_norm": 0.09097496420145035, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0027, + "step": 26110 + }, + { + "epoch": 1.5651027622985199, + "grad_norm": 0.11387573927640915, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0025, + "step": 26120 + }, + { + "epoch": 1.5657019593744383, + "grad_norm": 0.17657096683979034, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0036, + "step": 26130 + }, + { + "epoch": 1.5663011564503564, + "grad_norm": 0.09257909655570984, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0033, + "step": 26140 + }, + { + "epoch": 1.5669003535262749, + "grad_norm": 0.15154404938220978, + "learning_rate": 2.771889969647e-06, + "loss": 0.0046, + "step": 26150 + }, + { + "epoch": 1.567499550602193, + "grad_norm": 0.07300597429275513, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0028, + "step": 26160 + }, + { + "epoch": 1.5680987476781114, + "grad_norm": 0.12779368460178375, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0028, + "step": 26170 + }, + { + "epoch": 1.5686979447540295, + "grad_norm": 0.12631577253341675, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0035, + "step": 26180 + }, + { + "epoch": 1.569297141829948, + "grad_norm": 0.3630695044994354, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0034, + "step": 26190 + }, + { + "epoch": 1.569896338905866, + "grad_norm": 0.18113726377487183, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.005, + "step": 26200 + }, + { + "epoch": 1.5704955359817845, + "grad_norm": 0.21797926723957062, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0033, + "step": 26210 + }, + { + "epoch": 1.5710947330577025, + "grad_norm": 0.1614106148481369, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0034, + "step": 26220 + }, + { + "epoch": 1.571693930133621, + "grad_norm": 0.10198274999856949, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0038, + "step": 26230 + }, + { + "epoch": 1.572293127209539, + "grad_norm": 0.14413216710090637, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0022, + "step": 26240 + }, + { + "epoch": 1.5728923242854576, + "grad_norm": 0.08031613379716873, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0037, + "step": 26250 + }, + { + "epoch": 1.5734915213613756, + "grad_norm": 0.31797754764556885, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0035, + "step": 26260 + }, + { + "epoch": 1.574090718437294, + "grad_norm": 0.0591890886425972, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0027, + "step": 26270 + }, + { + "epoch": 1.5746899155132121, + "grad_norm": 0.15585894882678986, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0064, + "step": 26280 + }, + { + "epoch": 1.5752891125891306, + "grad_norm": 0.13518628478050232, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0042, + "step": 26290 + }, + { + "epoch": 1.5758883096650487, + "grad_norm": 0.13154275715351105, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0033, + "step": 26300 + }, + { + "epoch": 1.5764875067409672, + "grad_norm": 0.33374130725860596, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0061, + "step": 26310 + }, + { + "epoch": 1.5770867038168854, + "grad_norm": 0.12396867573261261, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.004, + "step": 26320 + }, + { + "epoch": 1.5776859008928037, + "grad_norm": 0.08533058315515518, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0031, + "step": 26330 + }, + { + "epoch": 1.578285097968722, + "grad_norm": 0.25102120637893677, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.003, + "step": 26340 + }, + { + "epoch": 1.5788842950446402, + "grad_norm": 0.10319694876670837, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0035, + "step": 26350 + }, + { + "epoch": 1.5794834921205585, + "grad_norm": 0.1508130133152008, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.0046, + "step": 26360 + }, + { + "epoch": 1.5800826891964768, + "grad_norm": 0.09007565677165985, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0025, + "step": 26370 + }, + { + "epoch": 1.580681886272395, + "grad_norm": 0.13807767629623413, + "learning_rate": 2.683592557860616e-06, + "loss": 0.003, + "step": 26380 + }, + { + "epoch": 1.5812810833483133, + "grad_norm": 0.1909133791923523, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.0034, + "step": 26390 + }, + { + "epoch": 1.5818802804242316, + "grad_norm": 0.14300945401191711, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.004, + "step": 26400 + }, + { + "epoch": 1.5824794775001498, + "grad_norm": 0.08184076100587845, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0025, + "step": 26410 + }, + { + "epoch": 1.583078674576068, + "grad_norm": 0.1493527740240097, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0026, + "step": 26420 + }, + { + "epoch": 1.5836778716519864, + "grad_norm": 0.09850698709487915, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0032, + "step": 26430 + }, + { + "epoch": 1.5842770687279046, + "grad_norm": 0.0875677615404129, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0039, + "step": 26440 + }, + { + "epoch": 1.584876265803823, + "grad_norm": 0.2319948524236679, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0039, + "step": 26450 + }, + { + "epoch": 1.5854754628797412, + "grad_norm": 0.10797403752803802, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0036, + "step": 26460 + }, + { + "epoch": 1.5860746599556594, + "grad_norm": 0.19400249421596527, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0034, + "step": 26470 + }, + { + "epoch": 1.5866738570315777, + "grad_norm": 0.1569194793701172, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0039, + "step": 26480 + }, + { + "epoch": 1.587273054107496, + "grad_norm": 0.17117120325565338, + "learning_rate": 2.643185095075473e-06, + "loss": 0.003, + "step": 26490 + }, + { + "epoch": 1.5878722511834142, + "grad_norm": 0.19703997671604156, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0041, + "step": 26500 + }, + { + "epoch": 1.5884714482593325, + "grad_norm": 0.09663215279579163, + "learning_rate": 2.635965610168249e-06, + "loss": 0.005, + "step": 26510 + }, + { + "epoch": 1.5890706453352508, + "grad_norm": 0.13411357998847961, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0035, + "step": 26520 + }, + { + "epoch": 1.589669842411169, + "grad_norm": 0.15013787150382996, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0031, + "step": 26530 + }, + { + "epoch": 1.5902690394870873, + "grad_norm": 0.15517787635326385, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0028, + "step": 26540 + }, + { + "epoch": 1.5908682365630056, + "grad_norm": 0.23037715256214142, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0034, + "step": 26550 + }, + { + "epoch": 1.5914674336389238, + "grad_norm": 0.1925845891237259, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0028, + "step": 26560 + }, + { + "epoch": 1.5920666307148421, + "grad_norm": 0.08933448791503906, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0029, + "step": 26570 + }, + { + "epoch": 1.5926658277907604, + "grad_norm": 0.14989611506462097, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0034, + "step": 26580 + }, + { + "epoch": 1.5932650248666786, + "grad_norm": 0.2904585897922516, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0027, + "step": 26590 + }, + { + "epoch": 1.593864221942597, + "grad_norm": 0.17784662544727325, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0039, + "step": 26600 + }, + { + "epoch": 1.5944634190185152, + "grad_norm": 0.07810595631599426, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0025, + "step": 26610 + }, + { + "epoch": 1.5950626160944334, + "grad_norm": 0.06783948838710785, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.003, + "step": 26620 + }, + { + "epoch": 1.5956618131703517, + "grad_norm": 0.13763132691383362, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0037, + "step": 26630 + }, + { + "epoch": 1.59626101024627, + "grad_norm": 0.1127597987651825, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0045, + "step": 26640 + }, + { + "epoch": 1.5968602073221883, + "grad_norm": 0.07828421145677567, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0027, + "step": 26650 + }, + { + "epoch": 1.5974594043981065, + "grad_norm": 0.1327218860387802, + "learning_rate": 2.583073279935805e-06, + "loss": 0.0042, + "step": 26660 + }, + { + "epoch": 1.5980586014740248, + "grad_norm": 0.09427100419998169, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0027, + "step": 26670 + }, + { + "epoch": 1.598657798549943, + "grad_norm": 0.2112533301115036, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0046, + "step": 26680 + }, + { + "epoch": 1.5992569956258613, + "grad_norm": 0.24039748311042786, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0032, + "step": 26690 + }, + { + "epoch": 1.5998561927017796, + "grad_norm": 0.28341665863990784, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.005, + "step": 26700 + }, + { + "epoch": 1.6004553897776979, + "grad_norm": 0.23401512205600739, + "learning_rate": 2.565935706183804e-06, + "loss": 0.0029, + "step": 26710 + }, + { + "epoch": 1.6010545868536161, + "grad_norm": 0.13487978279590607, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0028, + "step": 26720 + }, + { + "epoch": 1.6016537839295344, + "grad_norm": 0.10604815185070038, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0029, + "step": 26730 + }, + { + "epoch": 1.6022529810054527, + "grad_norm": 0.12193044275045395, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0031, + "step": 26740 + }, + { + "epoch": 1.602852178081371, + "grad_norm": 0.291572630405426, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0033, + "step": 26750 + }, + { + "epoch": 1.6034513751572892, + "grad_norm": 0.14938616752624512, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0027, + "step": 26760 + }, + { + "epoch": 1.6040505722332075, + "grad_norm": 0.16085144877433777, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0022, + "step": 26770 + }, + { + "epoch": 1.6046497693091257, + "grad_norm": 0.14876601099967957, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0034, + "step": 26780 + }, + { + "epoch": 1.605248966385044, + "grad_norm": 0.13766804337501526, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0029, + "step": 26790 + }, + { + "epoch": 1.6058481634609623, + "grad_norm": 0.1824955940246582, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0028, + "step": 26800 + }, + { + "epoch": 1.6064473605368805, + "grad_norm": 0.09187015891075134, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0028, + "step": 26810 + }, + { + "epoch": 1.6070465576127988, + "grad_norm": 0.1488831490278244, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0023, + "step": 26820 + }, + { + "epoch": 1.607645754688717, + "grad_norm": 0.16146720945835114, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0036, + "step": 26830 + }, + { + "epoch": 1.6082449517646353, + "grad_norm": 0.19863533973693848, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0034, + "step": 26840 + }, + { + "epoch": 1.6088441488405536, + "grad_norm": 0.08710742741823196, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0028, + "step": 26850 + }, + { + "epoch": 1.609443345916472, + "grad_norm": 0.1280236840248108, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0037, + "step": 26860 + }, + { + "epoch": 1.6100425429923901, + "grad_norm": 0.29420942068099976, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0031, + "step": 26870 + }, + { + "epoch": 1.6106417400683086, + "grad_norm": 0.16633544862270355, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0037, + "step": 26880 + }, + { + "epoch": 1.6112409371442267, + "grad_norm": 0.10398953408002853, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0032, + "step": 26890 + }, + { + "epoch": 1.6118401342201452, + "grad_norm": 0.1609172523021698, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0031, + "step": 26900 + }, + { + "epoch": 1.6124393312960632, + "grad_norm": 0.14156407117843628, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0032, + "step": 26910 + }, + { + "epoch": 1.6130385283719817, + "grad_norm": 0.3801378309726715, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0039, + "step": 26920 + }, + { + "epoch": 1.6136377254478997, + "grad_norm": 0.1612473726272583, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0026, + "step": 26930 + }, + { + "epoch": 1.6142369225238182, + "grad_norm": 0.3169429898262024, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0038, + "step": 26940 + }, + { + "epoch": 1.6148361195997363, + "grad_norm": 0.11678534001111984, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0041, + "step": 26950 + }, + { + "epoch": 1.6154353166756548, + "grad_norm": 0.08701438456773758, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0025, + "step": 26960 + }, + { + "epoch": 1.6160345137515728, + "grad_norm": 0.14214813709259033, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0032, + "step": 26970 + }, + { + "epoch": 1.6166337108274913, + "grad_norm": 0.06335555016994476, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0022, + "step": 26980 + }, + { + "epoch": 1.6172329079034093, + "grad_norm": 0.1225769966840744, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0035, + "step": 26990 + }, + { + "epoch": 1.6178321049793278, + "grad_norm": 0.12757551670074463, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0034, + "step": 27000 + }, + { + "epoch": 1.6184313020552459, + "grad_norm": 0.04847760871052742, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0025, + "step": 27010 + }, + { + "epoch": 1.6190304991311644, + "grad_norm": 0.11208045482635498, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0028, + "step": 27020 + }, + { + "epoch": 1.6196296962070824, + "grad_norm": 0.10029870271682739, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0042, + "step": 27030 + }, + { + "epoch": 1.620228893283001, + "grad_norm": 0.10894428193569183, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.003, + "step": 27040 + }, + { + "epoch": 1.620828090358919, + "grad_norm": 0.16484397649765015, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0027, + "step": 27050 + }, + { + "epoch": 1.6214272874348374, + "grad_norm": 0.18669992685317993, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0031, + "step": 27060 + }, + { + "epoch": 1.6220264845107555, + "grad_norm": 0.10345451533794403, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0024, + "step": 27070 + }, + { + "epoch": 1.622625681586674, + "grad_norm": 0.14037790894508362, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0027, + "step": 27080 + }, + { + "epoch": 1.623224878662592, + "grad_norm": 0.2581053078174591, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0027, + "step": 27090 + }, + { + "epoch": 1.6238240757385105, + "grad_norm": 0.12379001826047897, + "learning_rate": 2.440792688039862e-06, + "loss": 0.0024, + "step": 27100 + }, + { + "epoch": 1.6244232728144286, + "grad_norm": 0.17116566002368927, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0025, + "step": 27110 + }, + { + "epoch": 1.625022469890347, + "grad_norm": 0.13846145570278168, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0042, + "step": 27120 + }, + { + "epoch": 1.625621666966265, + "grad_norm": 0.09063230454921722, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0032, + "step": 27130 + }, + { + "epoch": 1.6262208640421836, + "grad_norm": 0.19914232194423676, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0043, + "step": 27140 + }, + { + "epoch": 1.6268200611181016, + "grad_norm": 0.13414347171783447, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0031, + "step": 27150 + }, + { + "epoch": 1.6274192581940201, + "grad_norm": 0.11173701286315918, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0037, + "step": 27160 + }, + { + "epoch": 1.6280184552699382, + "grad_norm": 0.11112072318792343, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0032, + "step": 27170 + }, + { + "epoch": 1.6286176523458566, + "grad_norm": 0.27570319175720215, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0048, + "step": 27180 + }, + { + "epoch": 1.6292168494217747, + "grad_norm": 0.09076099097728729, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0033, + "step": 27190 + }, + { + "epoch": 1.6298160464976932, + "grad_norm": 0.185089111328125, + "learning_rate": 2.411157015949963e-06, + "loss": 0.005, + "step": 27200 + }, + { + "epoch": 1.6304152435736112, + "grad_norm": 0.06751414388418198, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0026, + "step": 27210 + }, + { + "epoch": 1.6310144406495297, + "grad_norm": 0.14673012495040894, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0029, + "step": 27220 + }, + { + "epoch": 1.6316136377254478, + "grad_norm": 0.11741532385349274, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0034, + "step": 27230 + }, + { + "epoch": 1.6322128348013663, + "grad_norm": 0.06512618809938431, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0032, + "step": 27240 + }, + { + "epoch": 1.6328120318772843, + "grad_norm": 0.22004343569278717, + "learning_rate": 2.396716944205467e-06, + "loss": 0.004, + "step": 27250 + }, + { + "epoch": 1.6334112289532028, + "grad_norm": 0.1706841140985489, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0028, + "step": 27260 + }, + { + "epoch": 1.6340104260291208, + "grad_norm": 0.1023155003786087, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0029, + "step": 27270 + }, + { + "epoch": 1.6346096231050393, + "grad_norm": 0.17524677515029907, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0039, + "step": 27280 + }, + { + "epoch": 1.6352088201809574, + "grad_norm": 0.10368278622627258, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0026, + "step": 27290 + }, + { + "epoch": 1.6358080172568759, + "grad_norm": 0.06621989607810974, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0028, + "step": 27300 + }, + { + "epoch": 1.636407214332794, + "grad_norm": 0.2700876295566559, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0045, + "step": 27310 + }, + { + "epoch": 1.6370064114087124, + "grad_norm": 0.07727917283773422, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0034, + "step": 27320 + }, + { + "epoch": 1.6376056084846304, + "grad_norm": 0.11636139452457428, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0026, + "step": 27330 + }, + { + "epoch": 1.638204805560549, + "grad_norm": 0.07539201527833939, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0029, + "step": 27340 + }, + { + "epoch": 1.638804002636467, + "grad_norm": 0.14615486562252045, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0037, + "step": 27350 + }, + { + "epoch": 1.6394031997123855, + "grad_norm": 0.10396217554807663, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.004, + "step": 27360 + }, + { + "epoch": 1.6400023967883035, + "grad_norm": 0.08993582427501678, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0037, + "step": 27370 + }, + { + "epoch": 1.640601593864222, + "grad_norm": 0.15601681172847748, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0032, + "step": 27380 + }, + { + "epoch": 1.6412007909401403, + "grad_norm": 0.27940425276756287, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0035, + "step": 27390 + }, + { + "epoch": 1.6417999880160585, + "grad_norm": 0.19063127040863037, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0032, + "step": 27400 + }, + { + "epoch": 1.6423991850919768, + "grad_norm": 0.0989932119846344, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0033, + "step": 27410 + }, + { + "epoch": 1.642998382167895, + "grad_norm": 0.1885364055633545, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0046, + "step": 27420 + }, + { + "epoch": 1.6435975792438133, + "grad_norm": 0.19619473814964294, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0032, + "step": 27430 + }, + { + "epoch": 1.6441967763197316, + "grad_norm": 0.3142991364002228, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0041, + "step": 27440 + }, + { + "epoch": 1.6447959733956499, + "grad_norm": 0.09447146952152252, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0034, + "step": 27450 + }, + { + "epoch": 1.6453951704715681, + "grad_norm": 0.12683053314685822, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0044, + "step": 27460 + }, + { + "epoch": 1.6459943675474864, + "grad_norm": 0.30415666103363037, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0048, + "step": 27470 + }, + { + "epoch": 1.6465935646234047, + "grad_norm": 0.16130568087100983, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0032, + "step": 27480 + }, + { + "epoch": 1.647192761699323, + "grad_norm": 0.19884297251701355, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0036, + "step": 27490 + }, + { + "epoch": 1.6477919587752412, + "grad_norm": 0.2124500721693039, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0048, + "step": 27500 + }, + { + "epoch": 1.6483911558511595, + "grad_norm": 0.20656649768352509, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0044, + "step": 27510 + }, + { + "epoch": 1.6489903529270777, + "grad_norm": 0.08909416943788528, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0042, + "step": 27520 + }, + { + "epoch": 1.649589550002996, + "grad_norm": 0.1665533483028412, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0045, + "step": 27530 + }, + { + "epoch": 1.6501887470789143, + "grad_norm": 0.11362092941999435, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0028, + "step": 27540 + }, + { + "epoch": 1.6507879441548325, + "grad_norm": 0.11079458892345428, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0044, + "step": 27550 + }, + { + "epoch": 1.6513871412307508, + "grad_norm": 0.1600227653980255, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0032, + "step": 27560 + }, + { + "epoch": 1.651986338306669, + "grad_norm": 0.10425245016813278, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0024, + "step": 27570 + }, + { + "epoch": 1.6525855353825873, + "grad_norm": 0.142449289560318, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0031, + "step": 27580 + }, + { + "epoch": 1.6531847324585056, + "grad_norm": 0.13777248561382294, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0031, + "step": 27590 + }, + { + "epoch": 1.6537839295344239, + "grad_norm": 0.21916678547859192, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0034, + "step": 27600 + }, + { + "epoch": 1.6543831266103421, + "grad_norm": 0.11044235527515411, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0032, + "step": 27610 + }, + { + "epoch": 1.6549823236862604, + "grad_norm": 0.30877354741096497, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0039, + "step": 27620 + }, + { + "epoch": 1.6555815207621787, + "grad_norm": 0.12299321591854095, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.003, + "step": 27630 + }, + { + "epoch": 1.656180717838097, + "grad_norm": 0.10495458543300629, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0036, + "step": 27640 + }, + { + "epoch": 1.6567799149140152, + "grad_norm": 0.13938122987747192, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0045, + "step": 27650 + }, + { + "epoch": 1.6573791119899335, + "grad_norm": 0.1632867157459259, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0043, + "step": 27660 + }, + { + "epoch": 1.6579783090658518, + "grad_norm": 0.11505074799060822, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.004, + "step": 27670 + }, + { + "epoch": 1.65857750614177, + "grad_norm": 0.19847853481769562, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0033, + "step": 27680 + }, + { + "epoch": 1.6591767032176883, + "grad_norm": 0.0759914219379425, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0025, + "step": 27690 + }, + { + "epoch": 1.6597759002936066, + "grad_norm": 0.23778557777404785, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0029, + "step": 27700 + }, + { + "epoch": 1.6603750973695248, + "grad_norm": 0.14102019369602203, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0026, + "step": 27710 + }, + { + "epoch": 1.660974294445443, + "grad_norm": 0.09396950155496597, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0032, + "step": 27720 + }, + { + "epoch": 1.6615734915213614, + "grad_norm": 0.1578163504600525, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0034, + "step": 27730 + }, + { + "epoch": 1.6621726885972796, + "grad_norm": 0.12897615134716034, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0034, + "step": 27740 + }, + { + "epoch": 1.662771885673198, + "grad_norm": 0.05674497038125992, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0028, + "step": 27750 + }, + { + "epoch": 1.6633710827491162, + "grad_norm": 0.12161347270011902, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0033, + "step": 27760 + }, + { + "epoch": 1.6639702798250344, + "grad_norm": 0.11158734560012817, + "learning_rate": 2.261577490652103e-06, + "loss": 0.004, + "step": 27770 + }, + { + "epoch": 1.6645694769009527, + "grad_norm": 0.09899834543466568, + "learning_rate": 2.259249109209093e-06, + "loss": 0.003, + "step": 27780 + }, + { + "epoch": 1.665168673976871, + "grad_norm": 0.2654432952404022, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0032, + "step": 27790 + }, + { + "epoch": 1.6657678710527892, + "grad_norm": 0.1188909262418747, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0034, + "step": 27800 + }, + { + "epoch": 1.6663670681287075, + "grad_norm": 0.4437197148799896, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.005, + "step": 27810 + }, + { + "epoch": 1.6669662652046258, + "grad_norm": 0.17790400981903076, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0033, + "step": 27820 + }, + { + "epoch": 1.667565462280544, + "grad_norm": 0.10867536813020706, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0032, + "step": 27830 + }, + { + "epoch": 1.6681646593564623, + "grad_norm": 0.10958084464073181, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0024, + "step": 27840 + }, + { + "epoch": 1.6687638564323806, + "grad_norm": 0.06520948559045792, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0029, + "step": 27850 + }, + { + "epoch": 1.6693630535082988, + "grad_norm": 0.13580842316150665, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0029, + "step": 27860 + }, + { + "epoch": 1.669962250584217, + "grad_norm": 0.15817365050315857, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.002, + "step": 27870 + }, + { + "epoch": 1.6705614476601354, + "grad_norm": 0.35285326838493347, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0062, + "step": 27880 + }, + { + "epoch": 1.6711606447360536, + "grad_norm": 0.24554285407066345, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0043, + "step": 27890 + }, + { + "epoch": 1.671759841811972, + "grad_norm": 0.16509993374347687, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0032, + "step": 27900 + }, + { + "epoch": 1.6723590388878904, + "grad_norm": 0.13468189537525177, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0026, + "step": 27910 + }, + { + "epoch": 1.6729582359638084, + "grad_norm": 0.17360062897205353, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0029, + "step": 27920 + }, + { + "epoch": 1.673557433039727, + "grad_norm": 0.12582021951675415, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0035, + "step": 27930 + }, + { + "epoch": 1.674156630115645, + "grad_norm": 0.1015002503991127, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0033, + "step": 27940 + }, + { + "epoch": 1.6747558271915635, + "grad_norm": 0.3634873926639557, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0031, + "step": 27950 + }, + { + "epoch": 1.6753550242674815, + "grad_norm": 0.15137465298175812, + "learning_rate": 2.219094909262834e-06, + "loss": 0.006, + "step": 27960 + }, + { + "epoch": 1.6759542213434, + "grad_norm": 0.09976715594530106, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0031, + "step": 27970 + }, + { + "epoch": 1.676553418419318, + "grad_norm": 0.09910957515239716, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0024, + "step": 27980 + }, + { + "epoch": 1.6771526154952365, + "grad_norm": 0.11276205629110336, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0032, + "step": 27990 + }, + { + "epoch": 1.6777518125711546, + "grad_norm": 0.22798313200473785, + "learning_rate": 2.210624641425579e-06, + "loss": 0.004, + "step": 28000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.748965507895132e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..19d22af7b7d6155175015b5c3c5b452030d153ea --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-28000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf8e16588ffacf58cd09ed0241d355125d76c992d11c15a4bc8ee94db38dc3b +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3180771919159b1a2d73c43bc8cf42097657196 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8805f8908531b109f7f6b4df063664b6f536cfa398316ef359bee72c5e28e2 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..627385151b88c21eac3195320bb4364d04f3e13c --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86948cca4d01a987e5a3bd9f9005ea3f68bc6d7e9966463f5f48a450f5cf1971 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c97ef2523875853c9b2568bed1c7ba1a232166d7 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5d6df2df75c34f17edb92890cc4bf03e44c09933f4a35f72f6f5be21049eea +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..0da6184f9732635317d9591566929a0f088174db --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..21f5e7f5499a8396da069c35a081bcba48d29832 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7975912277548085, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 2.688621997833252, + "learning_rate": 1.8e-07, + "loss": 0.1495, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.1722424030303955, + "learning_rate": 3.8e-07, + "loss": 0.1358, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 2.3095974922180176, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1268, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 2.131070852279663, + "learning_rate": 7.8e-07, + "loss": 0.1224, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 2.273555278778076, + "learning_rate": 9.800000000000001e-07, + "loss": 0.118, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 1.3571869134902954, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.111, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 1.6004165410995483, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 1.0413638353347778, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0657, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 1.1965473890304565, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0493, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 1.1422100067138672, + "learning_rate": 1.98e-06, + "loss": 0.0444, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 0.6911118626594543, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0457, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 0.6770259737968445, + "learning_rate": 2.38e-06, + "loss": 0.0257, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 0.4811704456806183, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0208, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 0.7260023951530457, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0203, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 0.4369716942310333, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0174, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 0.4100959300994873, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0133, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 1.0024627447128296, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0149, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.4598183035850525, + "learning_rate": 3.58e-06, + "loss": 0.0143, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 0.7042055130004883, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0143, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 0.7677909731864929, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0151, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 0.45090702176094055, + "learning_rate": 4.18e-06, + "loss": 0.0113, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 0.4400976598262787, + "learning_rate": 4.38e-06, + "loss": 0.0155, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 0.2424178272485733, + "learning_rate": 4.58e-06, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 0.4720967411994934, + "learning_rate": 4.78e-06, + "loss": 0.0166, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 0.41622042655944824, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0104, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 0.6915765404701233, + "learning_rate": 5.18e-06, + "loss": 0.0108, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.25931113958358765, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0104, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.42486071586608887, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.3798843324184418, + "learning_rate": 5.78e-06, + "loss": 0.0107, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.3281213343143463, + "learning_rate": 5.98e-06, + "loss": 0.0081, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 0.3394489884376526, + "learning_rate": 6.18e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 0.38298189640045166, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0098, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 0.3188078999519348, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.3152049779891968, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0087, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.34163472056388855, + "learning_rate": 6.98e-06, + "loss": 0.01, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 0.43860143423080444, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0065, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.2845093309879303, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0086, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 0.4009752869606018, + "learning_rate": 7.58e-06, + "loss": 0.0099, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.37756970524787903, + "learning_rate": 7.78e-06, + "loss": 0.0097, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.38135284185409546, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0076, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 0.3145769536495209, + "learning_rate": 8.18e-06, + "loss": 0.0106, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 0.32534345984458923, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0069, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.24024507403373718, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0089, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 0.32857799530029297, + "learning_rate": 8.78e-06, + "loss": 0.0105, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.28823110461235046, + "learning_rate": 8.98e-06, + "loss": 0.0101, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 0.32506972551345825, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0126, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 0.19875723123550415, + "learning_rate": 9.38e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.3245992958545685, + "learning_rate": 9.58e-06, + "loss": 0.0099, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.24933603405952454, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0117, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.3154098391532898, + "learning_rate": 9.980000000000001e-06, + "loss": 0.009, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.3685779273509979, + "learning_rate": 1.018e-05, + "loss": 0.0101, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 0.7251449823379517, + "learning_rate": 1.038e-05, + "loss": 0.0119, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 0.3183727264404297, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.009, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.3737810254096985, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0089, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.45293235778808594, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.011, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 0.3476772606372833, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.008, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.38373252749443054, + "learning_rate": 1.138e-05, + "loss": 0.0088, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 0.2530902624130249, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.008, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 0.19455896317958832, + "learning_rate": 1.178e-05, + "loss": 0.008, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.3315221071243286, + "learning_rate": 1.198e-05, + "loss": 0.0102, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.23430880904197693, + "learning_rate": 1.218e-05, + "loss": 0.007, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.4636307656764984, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0075, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.3785994052886963, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0109, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.2804955542087555, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0099, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.393702894449234, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0132, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.400641530752182, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0099, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 0.24428881704807281, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0076, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 0.4449252188205719, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0103, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.406582236289978, + "learning_rate": 1.378e-05, + "loss": 0.0098, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.36386170983314514, + "learning_rate": 1.398e-05, + "loss": 0.0088, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.38196030259132385, + "learning_rate": 1.418e-05, + "loss": 0.01, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.28740620613098145, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.3616485297679901, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0094, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.4004146158695221, + "learning_rate": 1.478e-05, + "loss": 0.009, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.4585514962673187, + "learning_rate": 1.498e-05, + "loss": 0.0092, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.20028235018253326, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0138, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 0.46603646874427795, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0139, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0116, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.22323082387447357, + "learning_rate": 1.578e-05, + "loss": 0.0097, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.26777058839797974, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0081, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.32380548119544983, + "learning_rate": 1.618e-05, + "loss": 0.0087, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.5248059630393982, + "learning_rate": 1.638e-05, + "loss": 0.0102, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.3495309054851532, + "learning_rate": 1.658e-05, + "loss": 0.0121, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.3551771342754364, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.5039486289024353, + "learning_rate": 1.698e-05, + "loss": 0.0094, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.3826751410961151, + "learning_rate": 1.718e-05, + "loss": 0.0107, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.46699973940849304, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0122, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3312668204307556, + "learning_rate": 1.758e-05, + "loss": 0.0087, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 0.28113219141960144, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.49752357602119446, + "learning_rate": 1.798e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.4177795350551605, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0096, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.34015583992004395, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.4612225890159607, + "learning_rate": 1.858e-05, + "loss": 0.0084, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.3813643753528595, + "learning_rate": 1.878e-05, + "loss": 0.012, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 0.27937838435173035, + "learning_rate": 1.898e-05, + "loss": 0.0104, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.4471273422241211, + "learning_rate": 1.918e-05, + "loss": 0.0125, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.4010440707206726, + "learning_rate": 1.938e-05, + "loss": 0.0106, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.41607654094696045, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0107, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 0.3589233458042145, + "learning_rate": 1.978e-05, + "loss": 0.0081, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.5726460814476013, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0111, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.36717164516448975, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0102, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.47284170985221863, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.01, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.5372244119644165, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.40928924083709717, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0088, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.4905182421207428, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0107, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.3709850609302521, + "learning_rate": 1.999981616897523e-05, + "loss": 0.01, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 0.6419615745544434, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0095, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.4986196458339691, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0127, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.5523516535758972, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0115, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0113, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 0.5146775245666504, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0101, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.2972394824028015, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0092, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.4030104875564575, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0097, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 0.4765481650829315, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0136, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.4051239788532257, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.3703782558441162, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5248176455497742, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0112, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.3100311756134033, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0083, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.45929211378097534, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0114, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 0.5695507526397705, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.5395359992980957, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0151, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.5106327533721924, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0124, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.3423260450363159, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0132, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.32126766443252563, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.011, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.5105165839195251, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0085, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 0.31927764415740967, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0088, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 0.4421865940093994, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0093, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.2930506765842438, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0091, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.2920694053173065, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0085, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.2661049962043762, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0081, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 0.3047257661819458, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0083, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.2774506211280823, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.2554785907268524, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0096, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.5792570114135742, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0108, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.3250623941421509, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0125, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 0.5885359048843384, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0117, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.37988749146461487, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.009, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.3751101493835449, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0099, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.31976667046546936, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0097, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 0.37007251381874084, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0079, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.4624205231666565, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0103, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 0.3769538700580597, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0094, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.25460657477378845, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0076, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.3976004719734192, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0109, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.2983521521091461, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.25581008195877075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0101, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.29260268807411194, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0102, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.3522181808948517, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0105, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.36269208788871765, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0103, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.40412119030952454, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0116, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.24089744687080383, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0119, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.4667617082595825, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0084, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.30139675736427307, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0101, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.38486286997795105, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0097, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.3526909649372101, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0071, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.3023934066295624, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0125, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.2796316146850586, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.25742489099502563, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0089, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.3626627027988434, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.01, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.3032572567462921, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0084, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.23514018952846527, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0086, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.3835832476615906, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0091, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.5170259475708008, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0146, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 0.8983817100524902, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0112, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.26260825991630554, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0086, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.481942743062973, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0126, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.311187207698822, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0064, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.3346790373325348, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.33836621046066284, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0085, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.3678463101387024, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0098, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.6136184334754944, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0154, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.39811593294143677, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0112, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6973778009414673, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0099, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.4773237109184265, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.3776084780693054, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.009, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 0.5061993598937988, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0097, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.41183987259864807, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.009, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 0.31513598561286926, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0112, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.4571514129638672, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0097, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.3183996379375458, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.2978666126728058, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0089, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.4791043698787689, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0087, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.5216032266616821, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0124, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.44693392515182495, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0092, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 0.41371819376945496, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.3593288064002991, + "learning_rate": 1.996106060741973e-05, + "loss": 0.014, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 0.4550306499004364, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0098, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.3510669469833374, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0066, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.2778814136981964, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.32210350036621094, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0067, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.42160800099372864, + "learning_rate": 1.995639934033493e-05, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.49051347374916077, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0102, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.3643694519996643, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.009, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.3717772960662842, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0076, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.32102280855178833, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0081, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.36725476384162903, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0102, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.39626258611679077, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0078, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.4183773696422577, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0105, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.3494930863380432, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0078, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.6155357956886292, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0119, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.34380587935447693, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0105, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.5476253032684326, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.01, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 0.37999996542930603, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0094, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 0.3124147057533264, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0125, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.4887244999408722, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.01, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5969874858856201, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0106, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 0.4295594096183777, + "learning_rate": 1.993971819309759e-05, + "loss": 0.007, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.3899303078651428, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0096, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.3912282884120941, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0075, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.5355616807937622, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.29141828417778015, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0129, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.24389855563640594, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.009, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.4070908725261688, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0085, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.26783379912376404, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0071, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.2644960880279541, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0089, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.35223162174224854, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0093, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.47337162494659424, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0095, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.25418519973754883, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0093, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 0.36384159326553345, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0082, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.30014440417289734, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0081, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.41121408343315125, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0081, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.5576186776161194, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.008, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.35785913467407227, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.3306240439414978, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0084, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 0.37215736508369446, + "learning_rate": 1.991774193879505e-05, + "loss": 0.012, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.5504099726676941, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0088, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.24932143092155457, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.007, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.5866615176200867, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.5174368619918823, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0121, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.2345893532037735, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0095, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.2683233916759491, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.2471713274717331, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.5090919733047485, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0108, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.2857886552810669, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0078, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.23729385435581207, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0096, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.30867621302604675, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0088, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.42522960901260376, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0103, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.37170591950416565, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0105, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.3672806918621063, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0121, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.4048611521720886, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.01, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.24768167734146118, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0125, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 0.5003495812416077, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0125, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.4303686022758484, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0084, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.3701602518558502, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0101, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.38272005319595337, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0075, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.2844183146953583, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0105, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.31114980578422546, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0095, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.3436568081378937, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.273001104593277, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0076, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.2653564512729645, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.3115384578704834, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0132, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.25932809710502625, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0083, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.28656521439552307, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0066, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.31808462738990784, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.0115, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.18877890706062317, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0092, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.3685394525527954, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0091, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.3878263533115387, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0082, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 0.284507691860199, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0085, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.3473755121231079, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0081, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.39935287833213806, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0081, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.34282153844833374, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0076, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.3581090271472931, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0087, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.37332627177238464, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0089, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 0.5224587321281433, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0089, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.42577075958251953, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0108, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.4602234959602356, + "learning_rate": 1.985504281027289e-05, + "loss": 0.014, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.4852961003780365, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0091, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.4437471628189087, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0112, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.37050408124923706, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0068, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.3345497250556946, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.36727628111839294, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0081, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 0.37056809663772583, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0152, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.5640603303909302, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0085, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 0.3653910160064697, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0078, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.2954258322715759, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0083, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6086210012435913, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0082, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 0.5260390043258667, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0105, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.3067379295825958, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0092, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.3480100929737091, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0088, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.26472753286361694, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0067, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.5254784226417542, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0146, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.35744136571884155, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0098, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.36186468601226807, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 0.35203835368156433, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0115, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.30590811371803284, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0108, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0082, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.2946765720844269, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0075, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.33707642555236816, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.007, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.2572688162326813, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0099, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.3901146352291107, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0185, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.4349755644798279, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0084, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.2383752018213272, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0092, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.46043846011161804, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.24630354344844818, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0062, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.5232640504837036, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 0.3850713074207306, + "learning_rate": 1.979809151602651e-05, + "loss": 0.014, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 0.44703760743141174, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0081, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.3762659728527069, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0099, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.4593638479709625, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0093, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.40554332733154297, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.33439910411834717, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0081, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.2623269855976105, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0062, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.22419600188732147, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0078, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.37183159589767456, + "learning_rate": 1.978133252131276e-05, + "loss": 0.01, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.27857136726379395, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.27683520317077637, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0069, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.45064759254455566, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0076, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.24215294420719147, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0071, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.5163891315460205, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0078, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.3922234773635864, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0077, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.19653558731079102, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0063, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.17621839046478271, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0084, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.6482162475585938, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.32759004831314087, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.33347561955451965, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0073, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.42883744835853577, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0084, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 0.3348788917064667, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0082, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.28349289298057556, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0102, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.2733197510242462, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0074, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.3263874351978302, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.01, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.295757532119751, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5598515868186951, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0093, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.425937294960022, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0083, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.2442379742860794, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0087, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.3378766179084778, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0163, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5137761831283569, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0099, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.3825916647911072, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0096, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.32084307074546814, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0066, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.3979593515396118, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0077, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.3103732764720917, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0067, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.5531997084617615, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0131, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5423216819763184, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0121, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.5038735270500183, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0087, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.44273868203163147, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.008, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.335232675075531, + "learning_rate": 1.971017390295979e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.4746256470680237, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.26807400584220886, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0075, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.35464033484458923, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0123, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.33803898096084595, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0094, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 0.20334473252296448, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.34386369585990906, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0081, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.38781842589378357, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0088, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.25994163751602173, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0079, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.3342406451702118, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0091, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.3120318353176117, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0079, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.3556351661682129, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0073, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.21421445906162262, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0095, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.39498451352119446, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0087, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0079, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.16734588146209717, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0072, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.3987548351287842, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.3929785490036011, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0096, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.2884303331375122, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0102, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.3338335454463959, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0092, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.47452738881111145, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0093, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.25584715604782104, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0068, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 0.3038389980792999, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0076, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.4123639464378357, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0101, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.38520675897598267, + "learning_rate": 1.964833301001045e-05, + "loss": 0.014, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.3355116844177246, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.3479195535182953, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0105, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.2700177729129791, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0076, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.963745667883003e-05, + "loss": 0.008, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 0.18578873574733734, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0071, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.26316413283348083, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0079, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.28762468695640564, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0115, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 0.3712877631187439, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.2862299382686615, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0072, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.2730867564678192, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0101, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.327648401260376, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.41153189539909363, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.32522135972976685, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0095, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.22764958441257477, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0085, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.3491888642311096, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.3123551607131958, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0103, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.1881783902645111, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0085, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.40902259945869446, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0089, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.382953941822052, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0088, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 0.23950865864753723, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0064, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.3419397175312042, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0118, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.42207059264183044, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0091, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.40754130482673645, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0087, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.2390766590833664, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0069, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.2974188029766083, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.0091, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.2993582785129547, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.42652204632759094, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.3138194680213928, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.009, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.38833311200141907, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0083, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.4015152156352997, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0081, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.42086881399154663, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.26732996106147766, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0071, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.5763937830924988, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0101, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.2955382764339447, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0075, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.4625638723373413, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0094, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.29631468653678894, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0096, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.46335819363594055, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0103, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.3183141350746155, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.008, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.26456212997436523, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0083, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 0.40924879908561707, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0097, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 0.3981763422489166, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0094, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.36437541246414185, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0064, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.2935962378978729, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0081, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.3478807210922241, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0079, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.3460087180137634, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0069, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.2706817090511322, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0088, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.2674945890903473, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0083, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.2268197238445282, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0072, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.3216208219528198, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0092, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.3226968050003052, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0101, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.2743329405784607, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0075, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.32573118805885315, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0094, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 0.53167325258255, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0099, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.3915646970272064, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0089, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.4526256322860718, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0101, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.323249489068985, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0094, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4046335816383362, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0088, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.34745559096336365, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0078, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.30308133363723755, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0071, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.37923407554626465, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0076, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 0.26785972714424133, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0093, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.2778306305408478, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0083, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.611038088798523, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0098, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.4114893078804016, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0111, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.2732110023498535, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0076, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.2964401841163635, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0095, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.40240928530693054, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0097, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.3901022672653198, + "learning_rate": 1.944152646499645e-05, + "loss": 0.008, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.38001132011413574, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0109, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.35937973856925964, + "learning_rate": 1.943474465322135e-05, + "loss": 0.007, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.2745327651500702, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0075, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.1598518043756485, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.007, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.401614785194397, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0115, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.4127846360206604, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0068, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.22147920727729797, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0061, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.28602245450019836, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0067, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.22147324681282043, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0076, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.2550548315048218, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0088, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.24113087356090546, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0076, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.3658410608768463, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0075, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.3856262266635895, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0112, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0075, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.3767516314983368, + "learning_rate": 1.938969919958475e-05, + "loss": 0.01, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.24380649626255035, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.009, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.30575039982795715, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0079, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.32913386821746826, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.009, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.29845312237739563, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0099, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.44377902150154114, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.34614384174346924, + "learning_rate": 1.936834723687526e-05, + "loss": 0.009, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.3316318690776825, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0096, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.4076138734817505, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 0.30320486426353455, + "learning_rate": 1.935753861926916e-05, + "loss": 0.015, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.32243025302886963, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.011, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.323745459318161, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0077, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5750753283500671, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0088, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.22709843516349792, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0101, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.3067542314529419, + "learning_rate": 1.933932815280178e-05, + "loss": 0.007, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.392337828874588, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0089, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.43343180418014526, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0073, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.4371345341205597, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0078, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.35214635729789734, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0077, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.3259161412715912, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.3849303722381592, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0066, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.3968902826309204, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.33016201853752136, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0095, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.3859156668186188, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.008, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.3020654618740082, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.007, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.44503262639045715, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0105, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.3908904194831848, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0073, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.39256253838539124, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0078, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.352611243724823, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0077, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.39203983545303345, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0081, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.23835115134716034, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0066, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.24996638298034668, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0098, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.29537609219551086, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.2898835837841034, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0077, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.4040369391441345, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.3501318395137787, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0094, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5462452173233032, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0097, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.4217568337917328, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0072, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.18295089900493622, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.3695569336414337, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.37818798422813416, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0089, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.29818472266197205, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0084, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.3328498303890228, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.01, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.340724378824234, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0075, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.2966301441192627, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0063, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.30677109956741333, + "learning_rate": 1.922098355206593e-05, + "loss": 0.008, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.2091839611530304, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0078, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.4229014217853546, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0115, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.40779992938041687, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0075, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.378817081451416, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.008, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.29796919226646423, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.2702767252922058, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0076, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.31349876523017883, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0085, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 0.30500444769859314, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0093, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.2860834002494812, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0061, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.26036593317985535, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0099, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.19049863517284393, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0075, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.3235284388065338, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.364092618227005, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.011, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.2409065216779709, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0092, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.36907926201820374, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.008, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.3230077922344208, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.191047802567482, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0063, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.3346494436264038, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0082, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.21352025866508484, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0075, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.5505086779594421, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.34264758229255676, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0083, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.20266413688659668, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0074, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.24938757717609406, + "learning_rate": 1.912718096497034e-05, + "loss": 0.007, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.4140026569366455, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0086, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.4424414038658142, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0104, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 0.5327904224395752, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 0.35958340764045715, + "learning_rate": 1.911035077753307e-05, + "loss": 0.01, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.2547682523727417, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0066, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.3701247274875641, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0115, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.34443217515945435, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0077, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.20353800058364868, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0061, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5660653114318848, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0091, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.26445311307907104, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0073, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.5561402440071106, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0071, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.3700469434261322, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.35783904790878296, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.008, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.3238641619682312, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0081, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.25247740745544434, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0099, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.435730904340744, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.008, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.37758126854896545, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0068, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.33323949575424194, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0094, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.4356318712234497, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 0.37893903255462646, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0058, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.4411139190196991, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0085, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0087, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4287096858024597, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0107, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.43085435032844543, + "learning_rate": 1.902392195640386e-05, + "loss": 0.009, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 0.2709400951862335, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0066, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.358126163482666, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0082, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.25320038199424744, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0077, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 0.31440937519073486, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0077, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.25246965885162354, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0079, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.28420332074165344, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0101, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.25251317024230957, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0075, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.19744229316711426, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0069, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4457854628562927, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0073, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.36817625164985657, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0096, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.3394709825515747, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0073, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.2909093201160431, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0065, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0072, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.25512900948524475, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0096, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0073, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.33459368348121643, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0096, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.21619321405887604, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0063, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.25518253445625305, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0067, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.2273867279291153, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.007, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.2864684462547302, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.3077942728996277, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0075, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 0.40526703000068665, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.41480064392089844, + "learning_rate": 1.891523933768891e-05, + "loss": 0.01, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.2750788629055023, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0064, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 0.29671600461006165, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0095, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 0.24160107970237732, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0069, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 0.2949109971523285, + "learning_rate": 1.889660337749874e-05, + "loss": 0.007, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.2847975492477417, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0059, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.30052465200424194, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0067, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.36128419637680054, + "learning_rate": 1.888252908366661e-05, + "loss": 0.014, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.36974236369132996, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0064, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.43730056285858154, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0084, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.3145422339439392, + "learning_rate": 1.88683715346172e-05, + "loss": 0.008, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.35473865270614624, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0091, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.2501350939273834, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.008, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.34808069467544556, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0099, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.45218509435653687, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0068, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.34530994296073914, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0098, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.38257333636283875, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0101, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.3040159344673157, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0079, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.3323517143726349, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0068, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.2639414370059967, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0078, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.3493870794773102, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.5838330984115601, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0091, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 0.428803026676178, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0087, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.3654572069644928, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0114, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.3295663297176361, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0075, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.3469060957431793, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0074, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.3366406261920929, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.32569241523742676, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0054, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3086700737476349, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0086, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.38562801480293274, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0092, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.3523421585559845, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0085, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.2278694063425064, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0063, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.32141822576522827, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0147, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.3375259041786194, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0077, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4483063220977783, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0062, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.3667140007019043, + "learning_rate": 1.874717450126662e-05, + "loss": 0.008, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.3419000506401062, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0079, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.36556369066238403, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0079, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.33135318756103516, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0064, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0091, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.34939518570899963, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0072, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.34424352645874023, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0077, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.3460613191127777, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.38822048902511597, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0066, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.35550639033317566, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0083, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0087, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.38202086091041565, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0081, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.25744789838790894, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0074, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.29700344800949097, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0082, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.305786669254303, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0076, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.3291271924972534, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.26111704111099243, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0074, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.348176509141922, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.27502793073654175, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0076, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.2831551432609558, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0092, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.39652079343795776, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0066, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.3885122239589691, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0087, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.37296077609062195, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.33606627583503723, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0086, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.3855937421321869, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0097, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.3322301506996155, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0076, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 0.33322253823280334, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.009, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.22358210384845734, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0088, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.5901851058006287, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0088, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.4703235328197479, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0084, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.20072896778583527, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0077, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.3537980616092682, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0098, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.3123277723789215, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0068, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.35979342460632324, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.38628828525543213, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0074, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.3498038053512573, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0074, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.20784054696559906, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0059, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.1811107099056244, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0085, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.43317103385925293, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0064, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.3815033435821533, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0064, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.35989734530448914, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.008, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.46118423342704773, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.012, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.25334376096725464, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0078, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.26764073967933655, + "learning_rate": 1.852547637090483e-05, + "loss": 0.01, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.2785920202732086, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0066, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.41587865352630615, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0061, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.42850133776664734, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.32369133830070496, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0091, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.2930110692977905, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0069, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.3199067711830139, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 0.4349478483200073, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0078, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 0.3054976165294647, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0061, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.2826739251613617, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0068, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.25106528401374817, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.007, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.25897887349128723, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0076, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.26398584246635437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.41751599311828613, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0083, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.17239610850811005, + "learning_rate": 1.844974808419918e-05, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3300461173057556, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0051, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.2645586133003235, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0068, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.24550332129001617, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0071, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.2889944911003113, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0091, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.476601779460907, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0066, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.35630306601524353, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0074, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.35651877522468567, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0084, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.3889803886413574, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0079, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4214278757572174, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.009, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.30540233850479126, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0083, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.3624532222747803, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0076, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.32963570952415466, + "learning_rate": 1.838347361898993e-05, + "loss": 0.01, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.3533381521701813, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0064, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.3011729419231415, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0065, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.4733760952949524, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0089, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.38553985953330994, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0059, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.2560643255710602, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0073, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.39531010389328003, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0106, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.2701983153820038, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0086, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.352717787027359, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0096, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.29157745838165283, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0073, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.4267994165420532, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0075, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.36308032274246216, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0075, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.33457428216934204, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0103, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.3717971444129944, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0069, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 0.21432936191558838, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0081, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.2878777086734772, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0057, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.4453850984573364, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0095, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.36917057633399963, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.3252313733100891, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0082, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.2529674470424652, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.2816419303417206, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0097, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.6464210152626038, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.33034399151802063, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0069, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.27335023880004883, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0078, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 0.3158395290374756, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.5128306746482849, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0087, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 0.24884961545467377, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0084, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.324278324842453, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0075, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6472476124763489, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0093, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.21269051730632782, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0066, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.29203882813453674, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0074, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.30436405539512634, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0087, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5066608190536499, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0081, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.32647472620010376, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0066, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.2804315388202667, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0066, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.24779941141605377, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0074, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.34001022577285767, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0101, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.2611280381679535, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0082, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.3129233717918396, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0079, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.2822776734828949, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0098, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.36969345808029175, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0064, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.33959338068962097, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0088, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.2628033459186554, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0062, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.38812723755836487, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0061, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.26403307914733887, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0055, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.28676870465278625, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0127, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.606293797492981, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.37321826815605164, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0063, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.368115097284317, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0091, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.3368416726589203, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0068, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.23466472327709198, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.006, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.3796599507331848, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0169, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.2202090471982956, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0099, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.5006175637245178, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0086, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.3673453629016876, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0083, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4379428029060364, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.006, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.43015891313552856, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0084, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.2806220054626465, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0061, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.23545289039611816, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0062, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 0.32115358114242554, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.3217777907848358, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.3224331736564636, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0072, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.31703537702560425, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0082, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 0.4175204932689667, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.008, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.22969186305999756, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0084, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.3421284258365631, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0077, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.32668444514274597, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0071, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.2729822099208832, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0068, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.33153197169303894, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0074, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.4678424000740051, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0076, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.23711496591567993, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0076, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.3230719566345215, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0084, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.32328692078590393, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0075, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.566879153251648, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0072, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.26277920603752136, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0062, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.339163601398468, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0082, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.23408609628677368, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0061, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.2942394018173218, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0065, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 0.3774799704551697, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0063, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.2847958207130432, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0072, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.2577030062675476, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0088, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.2883673906326294, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0075, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.3596307933330536, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0073, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.30285483598709106, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0076, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.2933914363384247, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0077, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.7666468024253845, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0102, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.31347739696502686, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0072, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.3435507118701935, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0081, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.3266170620918274, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0058, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.284027099609375, + "learning_rate": 1.784745142605655e-05, + "loss": 0.005, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.19972574710845947, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0072, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.2587524950504303, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0067, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.2922254204750061, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0064, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.17053507268428802, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0092, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.2850453555583954, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0073, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.2844892144203186, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0075, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.28969481587409973, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0079, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.4704195261001587, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0102, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.2652505338191986, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0077, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.2656702399253845, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0118, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.2282119244337082, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0086, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.30130353569984436, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0062, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.2295757234096527, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0066, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.25287938117980957, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0065, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.3274557292461395, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0076, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.34377023577690125, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0079, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.36259520053863525, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0055, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.24462608993053436, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0067, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.3615039587020874, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0088, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.40002626180648804, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0086, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.3362888991832733, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.33698126673698425, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0087, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.3287750482559204, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0068, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.23409898579120636, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0063, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.23275460302829742, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0066, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.35324692726135254, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0068, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.2781875729560852, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0066, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.3083304166793823, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0069, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.22543831169605255, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0066, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.22566530108451843, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0066, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.3640650808811188, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.35346123576164246, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0069, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0076, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.30895760655403137, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0074, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.30667638778686523, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0082, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.3134152889251709, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0086, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.21407048404216766, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.3456077575683594, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.4259016513824463, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.009, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.38690924644470215, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0094, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.31742537021636963, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0065, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.3568819463253021, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0077, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.3771888315677643, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.25528469681739807, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0067, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.36028411984443665, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0064, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.41987329721450806, + "learning_rate": 1.754802282200567e-05, + "loss": 0.007, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.18902993202209473, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0064, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.1859915405511856, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0086, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.1778331696987152, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0052, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4222147464752197, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.007, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.26806506514549255, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0074, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.34431734681129456, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0056, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.41732800006866455, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0079, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.3027847409248352, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0054, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.47592151165008545, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0066, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9539707899093628, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0095, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.4084669351577759, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0082, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.3052361309528351, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0072, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.23123528063297272, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.009, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.20356184244155884, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0073, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.048543930053711, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.3017459213733673, + "learning_rate": 1.74400239259128e-05, + "loss": 0.007, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.3679676353931427, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0085, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.20339734852313995, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0087, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.3523346781730652, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0076, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.4162348210811615, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0063, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.3293565511703491, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0067, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.24455691874027252, + "learning_rate": 1.739902378104222e-05, + "loss": 0.007, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 0.17645037174224854, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0051, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.2554231286048889, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0076, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.20006878674030304, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.27911216020584106, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.5701723694801331, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0081, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.222118079662323, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0072, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.2762138843536377, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0049, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 1.4110082387924194, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0114, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.31313180923461914, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0078, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.20941513776779175, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0079, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.3963930308818817, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0053, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.2066672146320343, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.3919369876384735, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0082, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.2544628083705902, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.0054, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.31123557686805725, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0078, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.24768301844596863, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0051, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.26674744486808777, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0052, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.27382466197013855, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.23384103178977966, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.0059, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.3531075417995453, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0068, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.34425088763237, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0066, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.2716144323348999, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.35163211822509766, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0071, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.23585639894008636, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0072, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.28066661953926086, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0068, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.3146689832210541, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0071, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.37553170323371887, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.008, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.18403242528438568, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0068, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.3904851973056793, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0072, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.4481397867202759, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0074, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.31124234199523926, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0074, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.3815377354621887, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0084, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.2909438908100128, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0074, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.3408021330833435, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.23902025818824768, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0076, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.2194853127002716, + "learning_rate": 1.714740708672306e-05, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 0.4337097108364105, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0092, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.4132380783557892, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0078, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.3434816598892212, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0076, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.25129666924476624, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0058, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.45458248257637024, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0064, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.5350340008735657, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.009, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 0.28008121252059937, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.33276447653770447, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0064, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37103456258773804, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0078, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 0.4689319133758545, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0073, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.3622629642486572, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.006, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0073, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.19226481020450592, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0059, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.27806007862091064, + "learning_rate": 1.704700993266678e-05, + "loss": 0.007, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.25948378443717957, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0076, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.5857216715812683, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.30467140674591064, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0073, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.2067701816558838, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0068, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 0.5653601288795471, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0087, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.3107249140739441, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0065, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4027363061904907, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0098, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.2757766544818878, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0091, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.30397671461105347, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0061, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.28112074732780457, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0063, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.14751966297626495, + "learning_rate": 1.696714953556411e-05, + "loss": 0.008, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.2988373935222626, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0055, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.2706286311149597, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0066, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.3612031042575836, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.25386789441108704, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0065, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.3170768916606903, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0056, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.4776926338672638, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0059, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.34828829765319824, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0088, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.20440815389156342, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.2943046987056732, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0068, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.16982606053352356, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0073, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.5607914924621582, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0085, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.35823172330856323, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0064, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.23943926393985748, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0068, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.24083787202835083, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0056, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.37987980246543884, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0062, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.35953620076179504, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0069, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.22255095839500427, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0071, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.4121200442314148, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0098, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.2377164363861084, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0076, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.2298472374677658, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0064, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.40824711322784424, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0066, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.33295100927352905, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.007, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.3978032171726227, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0077, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.27672451734542847, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.006, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.2591206729412079, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0089, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.1749347746372223, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0051, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.18699893355369568, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0056, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.240631103515625, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0089, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.3650512993335724, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0075, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.3503545820713043, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0067, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.3086877167224884, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.41695648431777954, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0064, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.33144691586494446, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0067, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.2679164409637451, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0072, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.22681233286857605, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0071, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.36362454295158386, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0067, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.20192845165729523, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0067, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.3895004093647003, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0055, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.22510671615600586, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0069, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.19641445577144623, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0101, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.2914806008338928, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0076, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.3187137544155121, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0059, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.3116552233695984, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0095, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.2597426772117615, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0058, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.21480600535869598, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.23912057280540466, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.006, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.317941278219223, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0064, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.58933025598526, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0095, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.21906700730323792, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0105, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.23899045586585999, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0059, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.2969389259815216, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0124, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.3514954447746277, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0066, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.18145518004894257, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0077, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.3087640404701233, + "learning_rate": 1.656303606359183e-05, + "loss": 0.006, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.3532063364982605, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0055, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.34000685811042786, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.24904295802116394, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0073, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.008, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.20241902768611908, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.009, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.3215351700782776, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0075, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0081, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.48170387744903564, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0071, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.3369109630584717, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0066, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.34541958570480347, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0058, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.2493886947631836, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0058, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.22845667600631714, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.2695702016353607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0055, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 0.28211796283721924, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0052, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.1901162564754486, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.2701025605201721, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0061, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.36527693271636963, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0072, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.3061700463294983, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0067, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.5612105131149292, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0087, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.23399880528450012, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0072, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.314933180809021, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0078, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.35548436641693115, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0094, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.37685567140579224, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0084, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.3190719783306122, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0065, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.26337119936943054, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0063, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.3518264889717102, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0072, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.3185817003250122, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.2995646893978119, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0064, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.3110463619232178, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0063, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.24277286231517792, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0064, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.17603862285614014, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0061, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.28089356422424316, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0076, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.2855492830276489, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0047, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.3247278928756714, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0058, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.18349547684192657, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0061, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.30654969811439514, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.007, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.2674420177936554, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0067, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.38177546858787537, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0091, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.33796218037605286, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0068, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.3754856586456299, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0063, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.21820858120918274, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.007, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.36184942722320557, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0061, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.32240399718284607, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0063, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 0.24755406379699707, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0059, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.397858589887619, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0064, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.389072448015213, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0063, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.3368140757083893, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0071, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0062, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.24265453219413757, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0076, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.19892603158950806, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0064, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.1852462887763977, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0051, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.1886446475982666, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0075, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.25982722640037537, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0068, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.3376137614250183, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0058, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.33173730969429016, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0064, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.3177517354488373, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0072, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.3385971784591675, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0066, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0073, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.2335229516029358, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0056, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.24502214789390564, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0054, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.2009458988904953, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.3341793715953827, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0082, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.3872147798538208, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0063, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.29940876364707947, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0073, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.4895729720592499, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0086, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.4485950469970703, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0053, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.22961653769016266, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0077, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.24187293648719788, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.005, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.3535212278366089, + "learning_rate": 1.601916647245149e-05, + "loss": 0.007, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.26539868116378784, + "learning_rate": 1.601107070706339e-05, + "loss": 0.008, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.43096065521240234, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0076, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.16919535398483276, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0058, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.2383720725774765, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0064, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.36103156208992004, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0067, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.2657287120819092, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0072, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.21437199413776398, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0065, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.34000417590141296, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0046, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.4855337142944336, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0068, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.3178497850894928, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0064, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.3171309530735016, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0067, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.3364340662956238, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0067, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2272711992263794, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0069, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.29505178332328796, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0078, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.3755042552947998, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0081, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.2983969449996948, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0085, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.3112468421459198, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0072, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.1950412392616272, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0061, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.2153436243534088, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.25062650442123413, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0079, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.1407836377620697, + "learning_rate": 1.584793312377278e-05, + "loss": 0.005, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.17276513576507568, + "learning_rate": 1.583971586792325e-05, + "loss": 0.006, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.47983887791633606, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0076, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.28724750876426697, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0076, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.3224884569644928, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0079, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.37969788908958435, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0063, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.48106926679611206, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0071, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.3555319905281067, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0075, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.19486083090305328, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.006, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.42018064856529236, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0074, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.3075830936431885, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0071, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.20921990275382996, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0063, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.20436584949493408, + "learning_rate": 1.574895332125391e-05, + "loss": 0.006, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0071, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.22980183362960815, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0078, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.24825431406497955, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.22042447328567505, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0071, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.249199777841568, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0076, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.32628607749938965, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0057, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.35151633620262146, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0059, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.29098865389823914, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0064, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.24006013572216034, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0058, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.2797141671180725, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0073, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.2963006794452667, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.19539053738117218, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0053, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.2686854898929596, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0051, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.35952430963516235, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0071, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.21042552590370178, + "learning_rate": 1.562410199183484e-05, + "loss": 0.005, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.27942436933517456, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0068, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.17137926816940308, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0063, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.20331411063671112, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0047, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.15683002769947052, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0052, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.14726290106773376, + "learning_rate": 1.558221191857467e-05, + "loss": 0.006, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.2940376400947571, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0068, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.4059796929359436, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0067, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.2587816119194031, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0086, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.3462979793548584, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0078, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.5607128739356995, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0079, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.24189788103103638, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0052, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 0.23362945020198822, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0073, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.22395116090774536, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0059, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.3514958322048187, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0064, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.25395795702934265, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0081, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.2948741018772125, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.22298739850521088, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.46948447823524475, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0097, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.2992243468761444, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0083, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.18001538515090942, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0055, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.23337051272392273, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.2863878905773163, + "learning_rate": 1.543878746906905e-05, + "loss": 0.006, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.23027309775352478, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0072, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.21359150111675262, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0064, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.3878735601902008, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0069, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.29146283864974976, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.007, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.21782676875591278, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0051, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.45582008361816406, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0063, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.4554077982902527, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0067, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.2254059612751007, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0064, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.13952374458312988, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0061, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.23241721093654633, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0072, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.3424162268638611, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0058, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.21074503660202026, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0057, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.33662086725234985, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0056, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.24403709173202515, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0073, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.27195101976394653, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0058, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.34224429726600647, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0072, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.29089581966400146, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0053, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3397226333618164, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0066, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.30517837405204773, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0092, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.3485032021999359, + "learning_rate": 1.52681291800283e-05, + "loss": 0.007, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.31346458196640015, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0045, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.1864607185125351, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.006, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.20976679027080536, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.22616958618164062, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0059, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.14772117137908936, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.33677151799201965, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0059, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.32354292273521423, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0061, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.21409569680690765, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0064, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.4659721851348877, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 0.32267874479293823, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.5019848942756653, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0061, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.32694318890571594, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0076, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.3013843297958374, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.1973707377910614, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0059, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.3365449607372284, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0059, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.3398677110671997, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.007, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.27888917922973633, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0062, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.2814931273460388, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0069, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.3317541182041168, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.006, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.21940776705741882, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0052, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.239700049161911, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0059, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.19117280840873718, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0071, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.21827168762683868, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0056, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.25645333528518677, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0085, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.30847233533859253, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0055, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.3127819895744324, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0058, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.30181658267974854, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0075, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.34778207540512085, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0077, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.18988046050071716, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0048, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.3479195833206177, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0045, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.24158424139022827, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0051, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.14698052406311035, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0053, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.4441753625869751, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0065, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.28078633546829224, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.29406028985977173, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0048, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3856968581676483, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0067, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.36528849601745605, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0062, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.34250667691230774, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0053, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.2862832844257355, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0055, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.3683549761772156, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0091, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.26892581582069397, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.2220073938369751, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0052, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.18825116753578186, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0065, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.28731998801231384, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0069, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.26817163825035095, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0058, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.44162800908088684, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0065, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 0.2990165948867798, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0074, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.20428279042243958, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0053, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.2918189465999603, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0056, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.30408942699432373, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0063, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.2593521177768707, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0061, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.34048640727996826, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0054, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.2438877820968628, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0059, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.5205245018005371, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0065, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.3658570349216461, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0061, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.23279106616973877, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0039, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.2704083323478699, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0054, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.1849551945924759, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0061, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.21807430684566498, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0059, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.47879981994628906, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0061, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.24125567078590393, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0056, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.25820469856262207, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0053, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.30664944648742676, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0075, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.3646678030490875, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0057, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.2534210979938507, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0045, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.2125798910856247, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 0.4387839734554291, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0072, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.337387353181839, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.01, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.23150259256362915, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.3243090808391571, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0076, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.26716119050979614, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.006, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.15551891922950745, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0061, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.1841796338558197, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0058, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 0.3119230270385742, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.2633327841758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0059, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.24567869305610657, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0055, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.3697315454483032, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0061, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.1941021829843521, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0052, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.2610131502151489, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.007, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.24856074154376984, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0062, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.27259066700935364, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0052, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.20962993800640106, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0055, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.4015270471572876, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.22935271263122559, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0063, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.29984018206596375, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0059, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.35775551199913025, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0079, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.15501125156879425, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0054, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.3543296158313751, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.1982075721025467, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.2616399824619293, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0062, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.2612541615962982, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0064, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3081730008125305, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0055, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.24024926126003265, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.20793405175209045, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.21445533633232117, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0058, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.24078251421451569, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0059, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.36214157938957214, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0061, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.2583295702934265, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0054, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.2641732394695282, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0069, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.2179708331823349, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0049, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.27418699860572815, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0049, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.3894921839237213, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0076, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.3912152945995331, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0063, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.16886518895626068, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0059, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.2731325626373291, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.3299262225627899, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.007, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.2671407163143158, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0058, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.2701479196548462, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0059, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.3803080916404724, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.2621704041957855, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0061, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.27780428528785706, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0065, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.3326016962528229, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.3632255792617798, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0069, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.24395202100276947, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0065, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.3215671181678772, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0066, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0065, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.31547197699546814, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0043, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.1893424689769745, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0059, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.27042335271835327, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0059, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.22597061097621918, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0063, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.1742873191833496, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0062, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.16797663271427155, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0048, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.42558521032333374, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0075, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.37216684222221375, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0061, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.19943472743034363, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.2211161106824875, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0075, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.2680184245109558, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0052, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.2402123361825943, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0051, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.1881084442138672, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0066, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.26134756207466125, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0063, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.3185539245605469, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0062, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.3118845820426941, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.22595946490764618, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.007, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.2627023458480835, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0067, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.2984865605831146, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0051, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0057, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.3078263998031616, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0074, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.17885653674602509, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0057, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.37737196683883667, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0058, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.21651378273963928, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0053, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.1974128633737564, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0059, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.17184904217720032, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.3074864447116852, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0059, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.28784239292144775, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0061, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.3435216546058655, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0065, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.38048845529556274, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0057, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.1875533014535904, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0052, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.48555630445480347, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0063, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 0.25066429376602173, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0055, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.2763892412185669, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0059, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.21217335760593414, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0092, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.23555652797222137, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.14828811585903168, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 0.27303484082221985, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0047, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.14681454002857208, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0067, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.43693456053733826, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.2940906286239624, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0059, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.20382657647132874, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0074, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0069, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.31879740953445435, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0062, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4898712933063507, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0051, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.17142456769943237, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0061, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.14010348916053772, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0045, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.26882827281951904, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0056, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.2636195421218872, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0048, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.24932081997394562, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0045, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.3367895185947418, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0049, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.15173649787902832, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0053, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.34083831310272217, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0072, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3327343165874481, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0048, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.36545902490615845, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0076, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.22761192917823792, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0067, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.19272181391716003, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0072, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.2881070375442505, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.006, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.32841676473617554, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0063, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.19850151240825653, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0052, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.31401291489601135, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.4023345112800598, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0058, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.25802844762802124, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0051, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.19678954780101776, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0053, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.4545653164386749, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0073, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.36174362897872925, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0068, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.31692951917648315, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0063, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.3470834195613861, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.29541268944740295, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0062, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.26377183198928833, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.006, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.2019137591123581, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0058, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.45156505703926086, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.007, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.15810425579547882, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.006, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.20093902945518494, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.28989917039871216, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0062, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.39454182982444763, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0063, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.25967612862586975, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0069, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.2058791220188141, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.26367849111557007, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0074, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.2432256042957306, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0054, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.19844679534435272, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.16757237911224365, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0052, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.2988821566104889, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0047, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.2231496274471283, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0048, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.265029639005661, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0048, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.41179928183555603, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0049, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.33498677611351013, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0052, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.2323407232761383, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0048, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.27306419610977173, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0061, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.2791977822780609, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0088, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.453421026468277, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.2572932839393616, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0056, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.19572272896766663, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0051, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.2831172049045563, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0057, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.21267575025558472, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0059, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.3220005929470062, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0057, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.2515857517719269, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0063, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.18344618380069733, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0052, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.34515154361724854, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0052, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.16711464524269104, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0054, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.3027217984199524, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.006, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.31168296933174133, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.007, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5778804421424866, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0056, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.2591782212257385, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.2449295073747635, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0046, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 0.19733767211437225, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0054, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.14837461709976196, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0053, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.3784295916557312, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0054, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.2400134950876236, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.17671307921409607, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0051, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.2664073705673218, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.006, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.25426605343818665, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0062, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.26733267307281494, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0049, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.46151378750801086, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.006, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.17070212960243225, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0062, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.42009514570236206, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0052, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.20439159870147705, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.25189417600631714, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0066, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.21402288973331451, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0072, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.294109046459198, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0061, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.29355865716934204, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0061, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.2937833368778229, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0061, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.1926010102033615, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0056, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.21794214844703674, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0065, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.23409108817577362, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0067, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.4696379005908966, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0062, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.28415724635124207, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0061, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.22433705627918243, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0064, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3090682923793793, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0056, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.23742817342281342, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0057, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.2670089900493622, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0052, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.2810697555541992, + "learning_rate": 1.299277443549658e-05, + "loss": 0.007, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.44233059883117676, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0069, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.14227768778800964, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0064, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.298776239156723, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0072, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.2882034480571747, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0064, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.23135380446910858, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0064, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.2870500981807709, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.005, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0064, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.2949783504009247, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0081, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.2215491235256195, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.26351356506347656, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0082, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.1909482628107071, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0052, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.13428187370300293, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0068, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.2125115543603897, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0048, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.27032148838043213, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0056, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.20981402695178986, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0069, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.24961373209953308, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0073, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.13643066585063934, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0054, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.25289252400398254, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.4061530828475952, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.006, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.29924723505973816, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0055, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.37029367685317993, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0053, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.37273409962654114, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0066, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.18242980539798737, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0054, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.18563945591449738, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.32972440123558044, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0045, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 0.3327874541282654, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0065, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.2077408730983734, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.1813255399465561, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0055, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.17811767756938934, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.20526157319545746, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0043, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.112189382314682, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0055, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.29082757234573364, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0099, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.23212411999702454, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0067, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.17449915409088135, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.3327349126338959, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0047, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.2709571123123169, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0056, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.19788618385791779, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0063, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.22075456380844116, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.2943982779979706, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0057, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.1718410849571228, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.3546068072319031, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.18132814764976501, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0047, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.17795684933662415, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0048, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.22964486479759216, + "learning_rate": 1.257232766480803e-05, + "loss": 0.005, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.3259448707103729, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0072, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.18410101532936096, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.28669047355651855, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0056, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.25986725091934204, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0055, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.1731722205877304, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0053, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.17501944303512573, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.005, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.2749968469142914, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0046, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.26125603914260864, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0055, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.22476239502429962, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0103, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.26169249415397644, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0067, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.19236186146736145, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0048, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.26535508036613464, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0055, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.2534106373786926, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.29464206099510193, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0076, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.3711875081062317, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0059, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0055, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.27274343371391296, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.006, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.15951389074325562, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0069, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.33735600113868713, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0064, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.19443227350711823, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0051, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.1960541307926178, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0049, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21133695542812347, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0066, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.22702853381633759, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.006, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.22489185631275177, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0061, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.33164891600608826, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0067, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.22196516394615173, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0055, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.19532594084739685, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0048, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.41902172565460205, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0064, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0052, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.2507944703102112, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0051, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.30817684531211853, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0052, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.27485454082489014, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.006, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.14287802577018738, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 0.14513961970806122, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.3345814645290375, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0051, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.2974685728549957, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0049, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.3455393612384796, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0062, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.16792115569114685, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.3038713335990906, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.005, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.2928559184074402, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.2317439168691635, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0039, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.3498123586177826, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0067, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.2850436866283417, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0045, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.18316122889518738, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0089, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.34362390637397766, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0066, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.13047993183135986, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0057, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.3403606116771698, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0055, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.27717292308807373, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0043, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.27412480115890503, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.1914675235748291, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0075, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.3778243958950043, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0084, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.20566068589687347, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.007, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.1868937760591507, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0051, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.24719548225402832, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.005, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0053, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4353996217250824, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.005, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.31571000814437866, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.005, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.14182177186012268, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0048, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.3461489975452423, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0062, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.17980965971946716, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0043, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.28671878576278687, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0048, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.18663623929023743, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0072, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.25223061442375183, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0063, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.20179906487464905, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.37325599789619446, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0079, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.18855971097946167, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.2992260754108429, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0051, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.18020357191562653, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0046, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.2106374204158783, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3749687373638153, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.1616801619529724, + "learning_rate": 1.188676298665799e-05, + "loss": 0.007, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.20882001519203186, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0143, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.16600479185581207, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0052, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.406480073928833, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0051, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.27349016070365906, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0056, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.2340608835220337, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0044, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.3165459632873535, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0042, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.19552721083164215, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0047, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.21882636845111847, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0061, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.23699741065502167, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.283207505941391, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0053, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.2782933712005615, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0062, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.3389151096343994, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0074, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.25642505288124084, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.19476772844791412, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0067, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.1992277055978775, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0057, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.21006375551223755, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0058, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.18808932602405548, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0073, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.258075475692749, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0052, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.29291409254074097, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.19002115726470947, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0041, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.4246057868003845, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 0.16166792809963226, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.005, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.35779255628585815, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.20405125617980957, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0082, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.23229332268238068, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0095, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.21156901121139526, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0074, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.22334401309490204, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0051, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.18344342708587646, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0048, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.22982414066791534, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0056, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.24991759657859802, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0046, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.27965986728668213, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.309841126203537, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0054, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.20964398980140686, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0044, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.45226722955703735, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0057, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.17177052795886993, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0064, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.8886939287185669, + "learning_rate": 1.153689339251154e-05, + "loss": 0.008, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.14726528525352478, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0066, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.32135209441185, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0064, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.22926779091358185, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.21345189213752747, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0047, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.31324461102485657, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0072, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.2185574620962143, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0047, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.36229151487350464, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0042, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.3479749262332916, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0053, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.23806153237819672, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.30633601546287537, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0079, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.2326052039861679, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0063, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 0.1756114363670349, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0064, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.18622055649757385, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0045, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.3261238932609558, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.16155003011226654, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0057, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.22661013901233673, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0046, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.24310468137264252, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0044, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.16182619333267212, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0056, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.1656215786933899, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0039, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.2945510447025299, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0049, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.24436083436012268, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.34221476316452026, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0069, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.26235878467559814, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0055, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.23333275318145752, + "learning_rate": 1.130316049722011e-05, + "loss": 0.005, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.23382601141929626, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0057, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 0.1693800389766693, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0058, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.3740929067134857, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.005, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.26146796345710754, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0038, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.13361674547195435, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0053, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8631370663642883, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0085, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.2952764630317688, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0054, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.23047442734241486, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0054, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0059, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.3246142864227295, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0066, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.31531205773353577, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0045, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4806351959705353, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0089, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.15645328164100647, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0051, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.29767802357673645, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0044, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.23338516056537628, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0055, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.20454354584217072, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0049, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.2087928056716919, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.004, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.18911990523338318, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0058, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.16931432485580444, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.3027138411998749, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.22635169327259064, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.26646292209625244, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0047, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.20067426562309265, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0054, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.22507227957248688, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0076, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.18533077836036682, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.005, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.1757635474205017, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0077, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.2326493263244629, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.006, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.2661048471927643, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0048, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.3285987079143524, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0047, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.3764145076274872, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.005, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.19637148082256317, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0048, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 0.16601431369781494, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.005, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.12405529618263245, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0036, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.21413138508796692, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.3323937952518463, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0057, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.20915299654006958, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0054, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.28372666239738464, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0048, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.32995301485061646, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0051, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.2148507684469223, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0061, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.22549118101596832, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.005, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.19749189913272858, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0049, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.250184565782547, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0065, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.23174546658992767, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.2707926034927368, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0049, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.175989031791687, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0058, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.2267833948135376, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0044, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.3495822846889496, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0048, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.2051204890012741, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0063, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.22149987518787384, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0058, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.21434035897254944, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0046, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.2996143400669098, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0065, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.22886960208415985, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0053, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.3317148685455322, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.005, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.45717868208885193, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0062, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.1223258301615715, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0051, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.2037084549665451, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0046, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.3772616982460022, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0045, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.30312252044677734, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0069, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.14988413453102112, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0047, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.3409348130226135, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.2308650016784668, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0049, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.15572187304496765, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0051, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.1962181180715561, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0049, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.337464302778244, + "learning_rate": 1.067930046280971e-05, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.17047251760959625, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0045, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.3098141849040985, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0043, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.17919068038463593, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0052, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.3461310863494873, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.006, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.37006744742393494, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0066, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.19726566970348358, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.005, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.1319705843925476, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0049, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.2131422460079193, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0055, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.1435563862323761, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0067, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.24024318158626556, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0055, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.1511068344116211, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0052, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.16795606911182404, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0047, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.1475641280412674, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0046, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.21277494728565216, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0048, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.2511015832424164, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.24675171077251434, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0059, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.2560728192329407, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0055, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.30879196524620056, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.005, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.1838868409395218, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0052, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.1673516035079956, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.20293423533439636, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0047, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.25513023138046265, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0052, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.26149800419807434, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.27551159262657166, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0041, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.2508440911769867, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0043, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.2889135181903839, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.1755184680223465, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0051, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.2095116674900055, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.33451047539711, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0079, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.44589516520500183, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0064, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.24158142507076263, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.15632936358451843, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.006, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.10808487981557846, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0065, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.1782998889684677, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0046, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.16395118832588196, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.004, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 0.30205732583999634, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0058, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.1561775654554367, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.004, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.1649634838104248, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0062, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.15428072214126587, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0043, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.11285894364118576, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0067, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.3470291793346405, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0056, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.16610246896743774, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0051, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.29931193590164185, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0051, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.15366005897521973, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.005, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.2352767139673233, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0057, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.19226962327957153, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0042, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.1903623789548874, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0044, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.4167932868003845, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0071, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.2913760840892792, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0046, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.2632276713848114, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0063, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.21258050203323364, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0043, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.19750680029392242, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0032, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.2896588444709778, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0045, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3017624020576477, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0074, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.18355949223041534, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0051, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.16483789682388306, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0056, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.2190672904253006, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.28435200452804565, + "learning_rate": 1.011517750003287e-05, + "loss": 0.005, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.2564929723739624, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0049, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.2592712342739105, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0048, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.18716935813426971, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0047, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.18236829340457916, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0049, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.27956655621528625, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0056, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.13664546608924866, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0048, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.21617569029331207, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0052, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.2196502536535263, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0054, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.20864732563495636, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0041, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.38381293416023254, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.005, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.1605401486158371, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0045, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.2079813927412033, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0051, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.2110205590724945, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0054, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.2421400547027588, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0048, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.41358140110969543, + "learning_rate": 9.969762660447491e-06, + "loss": 0.006, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.23386628925800323, + "learning_rate": 9.960077585586335e-06, + "loss": 0.005, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.20425592362880707, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0059, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.21164651215076447, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.1642364114522934, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0034, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.18716906011104584, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.15626995265483856, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0044, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.18394386768341064, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0044, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.3590037524700165, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0073, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.2103291153907776, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0051, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.19865299761295319, + "learning_rate": 9.87296819358355e-06, + "loss": 0.006, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.2052467316389084, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0065, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.31245940923690796, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0049, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.2959006726741791, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0042, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.33695659041404724, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0071, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.20898328721523285, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0062, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.3500119149684906, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0049, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.3926694095134735, + "learning_rate": 9.805290087509098e-06, + "loss": 0.007, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.24234539270401, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.1705496460199356, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0056, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.2907398045063019, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0048, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.2366454005241394, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0047, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.25498414039611816, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0046, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.163838192820549, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0048, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.1613040417432785, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0048, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.3639470338821411, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0042, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.22151169180870056, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0043, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.13474372029304504, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0051, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.2601003050804138, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0038, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.20202822983264923, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.18514803051948547, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0061, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.16678287088871002, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0038, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.17608965933322906, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0041, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.26356828212738037, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0059, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.297612726688385, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0047, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.16363881528377533, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.12642459571361542, + "learning_rate": 9.621949874438232e-06, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.3339644968509674, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0052, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.20784282684326172, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0046, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.28467273712158203, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0047, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.3124372661113739, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0051, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.3490087389945984, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.15114343166351318, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0051, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.41157594323158264, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0058, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.40405890345573425, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0045, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.1149911880493164, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0087, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.18746539950370789, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0058, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.1327875554561615, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0049, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.1530160903930664, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0038, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.2663615047931671, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0049, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.3390499949455261, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0046, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.2461002618074417, + "learning_rate": 9.477616135359713e-06, + "loss": 0.006, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.2141093611717224, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0049, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.20443470776081085, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0052, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.14927290380001068, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0039, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.3012462854385376, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0047, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.33484792709350586, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0045, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.19986321032047272, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0041, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.21612870693206787, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.19541047513484955, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0044, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.24203962087631226, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0049, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.1470087766647339, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0049, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.2336059808731079, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0048, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.32893121242523193, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0044, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.32034680247306824, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0055, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.27538758516311646, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0049, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.18869644403457642, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0065, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.2719379961490631, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0047, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.2850756347179413, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0043, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.19997543096542358, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0068, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.19222821295261383, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.16414248943328857, + "learning_rate": 9.285803018919292e-06, + "loss": 0.004, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.23754803836345673, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0039, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.2682085335254669, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0048, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.18268488347530365, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0046, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.14906349778175354, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0034, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.19079554080963135, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0041, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.09538780897855759, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.19193744659423828, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.1366361379623413, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.29436588287353516, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0052, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.24179348349571228, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0047, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.236627459526062, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0061, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.1719210296869278, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0054, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.2724406123161316, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.09852395206689835, + "learning_rate": 9.152007262148612e-06, + "loss": 0.004, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.23493632674217224, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.20697079598903656, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0047, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.16597376763820648, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0048, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.23542962968349457, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0046, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.18859006464481354, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0054, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.16773538291454315, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0044, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.2122378647327423, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0042, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.18205690383911133, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0046, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.1791398823261261, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0043, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.4446735680103302, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0052, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.32150915265083313, + "learning_rate": 9.047178679583151e-06, + "loss": 0.005, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.15855731070041656, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0045, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.19377414882183075, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0057, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.25969046354293823, + "learning_rate": 9.018636566864313e-06, + "loss": 0.006, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.2349981814622879, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.1853523701429367, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0051, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.22417226433753967, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.1969340741634369, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0058, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.18523764610290527, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.28188323974609375, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0052, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.18134717643260956, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0048, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.15660132467746735, + "learning_rate": 8.942627394858978e-06, + "loss": 0.004, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.3179869055747986, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0044, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.14007267355918884, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.31531354784965515, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0062, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.1867508888244629, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0054, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4172282814979553, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0056, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.21233956515789032, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0054, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.13055016100406647, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0048, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.24662990868091583, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0054, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.1877284198999405, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0045, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.20158089697360992, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0052, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.23169469833374023, + "learning_rate": 8.83836825410936e-06, + "loss": 0.0048, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.27991265058517456, + "learning_rate": 8.828905148874785e-06, + "loss": 0.008, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.3321090638637543, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.14790703356266022, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0033, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.1504756361246109, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0052, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.2211659848690033, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0038, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.1777208149433136, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0041, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.2586643397808075, + "learning_rate": 8.772180411864604e-06, + "loss": 0.006, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.2705499529838562, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.16527540981769562, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0037, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.24313445389270782, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0057, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.16705767810344696, + "learning_rate": 8.734416061983528e-06, + "loss": 0.004, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.20638783276081085, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0052, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.26159438490867615, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0039, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.30387070775032043, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0038, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.24292278289794922, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0042, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.3707493543624878, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0056, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.41142478585243225, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.22052627801895142, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0047, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.14626234769821167, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0047, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.25504666566848755, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0046, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.2020457535982132, + "learning_rate": 8.640192851412488e-06, + "loss": 0.006, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.2440478354692459, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0047, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.12040785700082779, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0044, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.25539812445640564, + "learning_rate": 8.611979388060327e-06, + "loss": 0.006, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.20701228082180023, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0041, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.24188214540481567, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0063, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.24987974762916565, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0063, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.20973123610019684, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0049, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.19898714125156403, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0061, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21703247725963593, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0056, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.18688541650772095, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0054, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.30194586515426636, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0049, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.17975366115570068, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0046, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.25966599583625793, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.1702205240726471, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.18940114974975586, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0052, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.18239127099514008, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0047, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.14571616053581238, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0046, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.17203395068645477, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0038, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.249881312251091, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0056, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.296194463968277, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0044, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.21376049518585205, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0052, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.2952374815940857, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.20862646400928497, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0051, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.17828255891799927, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0053, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.20771050453186035, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0038, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3046565651893616, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.12605167925357819, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0046, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.13702887296676636, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0038, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.11569058150053024, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0042, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.27488255500793457, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0054, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.30820342898368835, + "learning_rate": 8.349909816537207e-06, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.3108576536178589, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0056, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.16087505221366882, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0044, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.27139320969581604, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0055, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.17057007551193237, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0036, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.13946233689785004, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0057, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.2342602014541626, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0038, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.17249339818954468, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.2641673684120178, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0044, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.18304336071014404, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0041, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.25955966114997864, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0045, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.2159314751625061, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0038, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.254371702671051, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0043, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.10616741329431534, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0036, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.38598379492759705, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0065, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.3797863721847534, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0048, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.2059139758348465, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0062, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.19991335272789001, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0043, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.17376656830310822, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0047, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.17102457582950592, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0056, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.501983642578125, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0065, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.40338510274887085, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.10511627048254013, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0052, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.2610682249069214, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 0.09666074812412262, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0058, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.19014683365821838, + "learning_rate": 8.117972135268806e-06, + "loss": 0.005, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.2999255657196045, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.20351538062095642, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0049, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.1562410295009613, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0034, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.14160799980163574, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0035, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.10796743631362915, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0056, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.28861188888549805, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.3835368752479553, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.21850043535232544, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0038, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.2950346767902374, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0068, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.13051068782806396, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0041, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.11036359518766403, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0074, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.35306516289711, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0087, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.29782727360725403, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0045, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.20690713822841644, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0042, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.16064110398292542, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0038, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.2477649450302124, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0042, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.25939393043518066, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0045, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3345301151275635, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0045, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.19570066034793854, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0052, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.09655601531267166, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.13345655798912048, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0031, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.3130756616592407, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0072, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.16259168088436127, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0036, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.2581227123737335, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.36706119775772095, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.1705426573753357, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0069, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.4281153380870819, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0057, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.25743696093559265, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0036, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.17692404985427856, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.17617255449295044, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0043, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.193951815366745, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0042, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.2187023162841797, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0047, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.21488729119300842, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0039, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.13388743996620178, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0043, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.26977118849754333, + "learning_rate": 7.796848308199681e-06, + "loss": 0.004, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.40695786476135254, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0049, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.29070621728897095, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0056, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.2745647728443146, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0056, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.20881050825119019, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0057, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.17475518584251404, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0041, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.2414310723543167, + "learning_rate": 7.742248115573104e-06, + "loss": 0.004, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.20051640272140503, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.18383435904979706, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.16546988487243652, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0041, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.17165544629096985, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.25065234303474426, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0048, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.19762223958969116, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0038, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.23894545435905457, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.2860289216041565, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0053, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.3699626624584198, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0061, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.2370971292257309, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0043, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.19790691137313843, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0042, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.14648208022117615, + "learning_rate": 7.633462930388875e-06, + "loss": 0.005, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.105158232152462, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0032, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.24994254112243652, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0042, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.30648791790008545, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0058, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.16284243762493134, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0047, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.14919471740722656, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0045, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.14879491925239563, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0047, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.11741457879543304, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0041, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.09406878799200058, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0029, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.20860706269741058, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.24234607815742493, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0047, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.27025938034057617, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0042, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.15129081904888153, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0046, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.11173490434885025, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0035, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.2204807698726654, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0036, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.20111115276813507, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0087, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.213748961687088, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0045, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.21150177717208862, + "learning_rate": 7.480328799175369e-06, + "loss": 0.004, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.2450210005044937, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.16161729395389557, + "learning_rate": 7.4623904967312e-06, + "loss": 0.004, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.15077564120292664, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3078431487083435, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0051, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.15213221311569214, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.12404917925596237, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0042, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.18779516220092773, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0041, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.4039568603038788, + "learning_rate": 7.408675563767873e-06, + "loss": 0.005, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.2045651078224182, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0057, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.3885338306427002, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0049, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.253049373626709, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0059, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.250356525182724, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.3269367814064026, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0112, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.15401138365268707, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0052, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.1631775051355362, + "learning_rate": 7.346200065486093e-06, + "loss": 0.004, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.17112085223197937, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0038, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.24018551409244537, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0056, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.17964349687099457, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0057, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.1747465431690216, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0053, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.21299205720424652, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0038, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.13219258189201355, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0057, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 1.0558332204818726, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0066, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2154799997806549, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0041, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.13665339350700378, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.2101723700761795, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0039, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.13208501040935516, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0054, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.09342823177576065, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0032, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.22464905679225922, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0055, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.17030438780784607, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0042, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.17673689126968384, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0055, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.24041922390460968, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0048, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.14808662235736847, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.2489791214466095, + "learning_rate": 7.186522173441719e-06, + "loss": 0.004, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.19468742609024048, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0042, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.15028323233127594, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0061, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.13852037489414215, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.1401798278093338, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0063, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.1831122189760208, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0034, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.2867920994758606, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.13363438844680786, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0038, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.20085696876049042, + "learning_rate": 7.116016051769541e-06, + "loss": 0.004, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.1598372906446457, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0042, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.09672598540782928, + "learning_rate": 7.098434895408162e-06, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.18206225335597992, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0048, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.1818019449710846, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0038, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.21658800542354584, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.08513368666172028, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.10634194314479828, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0044, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.12106078863143921, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0037, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.11508465558290482, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0036, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.20805053412914276, + "learning_rate": 7.028294242074066e-06, + "loss": 0.004, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.23920200765132904, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0045, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.1300375908613205, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0045, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.23444809019565582, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0036, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.2636217772960663, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0044, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.31166398525238037, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.005, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.32881107926368713, + "learning_rate": 6.975884226362e-06, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.41748252511024475, + "learning_rate": 6.967165692827958e-06, + "loss": 0.006, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.1588834673166275, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0039, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.23697984218597412, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0039, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.19356773793697357, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0061, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.16373206675052643, + "learning_rate": 6.932338988482141e-06, + "loss": 0.004, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.1331700086593628, + "learning_rate": 6.923644220932124e-06, + "loss": 0.004, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4039696753025055, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0057, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.30325421690940857, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0065, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.21767468750476837, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0038, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.17474445700645447, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0056, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.17118008434772491, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.44261473417282104, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0063, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.18502798676490784, + "learning_rate": 6.862915366041247e-06, + "loss": 0.004, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.19384194910526276, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0036, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.1448352187871933, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0044, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3728172779083252, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0038, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.31421783566474915, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0043, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.28181371092796326, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0045, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.2249889373779297, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0041, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.26402008533477783, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0043, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.22621415555477142, + "learning_rate": 6.793802468038111e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.2681289315223694, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0045, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.17681041359901428, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0037, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.16526542603969574, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0032, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.30313149094581604, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0046, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.17628541588783264, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0065, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.1840096414089203, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.146232470870018, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0035, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.4804438352584839, + "learning_rate": 6.725005485342219e-06, + "loss": 0.005, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.2245558500289917, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0039, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.21845588088035583, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.1743943691253662, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0037, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.16978098452091217, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.27158796787261963, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0043, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.13516400754451752, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0048, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.1645064353942871, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0038, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.07616083323955536, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0046, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.13306911289691925, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.19445037841796875, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0044, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.18423207104206085, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0049, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.19280213117599487, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0043, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.25472623109817505, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0033, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.16799427568912506, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0031, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.2097395807504654, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.31450021266937256, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.16530238091945648, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0034, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.2506805956363678, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0038, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.1876160055398941, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0035, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.23704354465007782, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.13814999163150787, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0042, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.1164403185248375, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0042, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.23078426718711853, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0038, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.21749110519886017, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0046, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.24972137808799744, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.2491082102060318, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0043, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.14915086328983307, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0048, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.2794116735458374, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.13765662908554077, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0047, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.14874878525733948, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0042, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.1800280064344406, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0057, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.17518648505210876, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0049, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0045, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.3590790033340454, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0039, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.14534324407577515, + "learning_rate": 6.427861749601945e-06, + "loss": 0.004, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.1662825047969818, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.27466440200805664, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0045, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.1323469579219818, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0047, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.12367355078458786, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0077, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.18238325417041779, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0058, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.2733745574951172, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.3367181420326233, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0039, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.20671530067920685, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.23353071510791779, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0033, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.21081902086734772, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0031, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.3426077365875244, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0049, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.3905622959136963, + "learning_rate": 6.327475567095824e-06, + "loss": 0.004, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.1888400912284851, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0041, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.23982487618923187, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0041, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.2061331421136856, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0046, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.17000116407871246, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0033, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.15905790030956268, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0049, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.16794419288635254, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0052, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.3003343641757965, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0061, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.1429288536310196, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0042, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.18542084097862244, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0047, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.2692892253398895, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0035, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.23286236822605133, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0037, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.0963423103094101, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0041, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.1425798237323761, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0043, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.0960182398557663, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0046, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.2674477994441986, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 0.16276703774929047, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0041, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.24255621433258057, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.003, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.20395220816135406, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0054, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.12099681794643402, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0082, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.14017170667648315, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0042, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.28132137656211853, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0043, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.27220970392227173, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0039, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.23647353053092957, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0058, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.20623824000358582, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.12366114556789398, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0037, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.23330192267894745, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0056, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.19991633296012878, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0031, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.1496160626411438, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0058, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.13247868418693542, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0037, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.19072194397449493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0057, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.10773085057735443, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0042, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.14058449864387512, + "learning_rate": 6.063685039328116e-06, + "loss": 0.005, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.10825464874505997, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0042, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.18059906363487244, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.1713389754295349, + "learning_rate": 6.039253929027638e-06, + "loss": 0.005, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.23789434134960175, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0047, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.17626744508743286, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0041, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.2091904729604721, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0044, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.17293672263622284, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0043, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.13156521320343018, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0039, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.19591976702213287, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0043, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.16212835907936096, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0039, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.10661022365093231, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0037, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.16630858182907104, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.11001022905111313, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0037, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.1888381838798523, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0044, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.19239328801631927, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0044, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.16555139422416687, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0032, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.19748231768608093, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0043, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.1546473354101181, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0049, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.30511707067489624, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0037, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.1722872257232666, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0048, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.1784086525440216, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.15101182460784912, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0042, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.1252688318490982, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0041, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.15101821720600128, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0043, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.21302345395088196, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0035, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.1591431051492691, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0033, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.16010484099388123, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0049, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.19287234544754028, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0037, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.1804349720478058, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0036, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.14769446849822998, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0044, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.15914054214954376, + "learning_rate": 5.813791207086085e-06, + "loss": 0.004, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.19632315635681152, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0034, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3017818331718445, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0046, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.2728461027145386, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0044, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.18619874119758606, + "learning_rate": 5.781966956563247e-06, + "loss": 0.004, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.1235085129737854, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0037, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.15798084437847137, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0035, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.15713484585285187, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0036, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.15594886243343353, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0038, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.1558992713689804, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.20599815249443054, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0054, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.2785670757293701, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0042, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.22550497949123383, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.15210074186325073, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.18905121088027954, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.1337066888809204, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.23699362576007843, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.2480958253145218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0037, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.09328999370336533, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.3416430950164795, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0048, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.13258710503578186, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0032, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.18493984639644623, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0037, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.10433483123779297, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0045, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.18333138525485992, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0038, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.25164106488227844, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0058, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.17989882826805115, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0041, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.1597793847322464, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.1543695032596588, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0036, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.2985675036907196, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0043, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.1357773244380951, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0036, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.23978300392627716, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.005, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.12806151807308197, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0035, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.2222731113433838, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0039, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.16744646430015564, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0035, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.2162114977836609, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0048, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.14857177436351776, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0036, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.21318115293979645, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0032, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.257682204246521, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0036, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.254349946975708, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0042, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.148925319314003, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0029, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.1902056336402893, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0031, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.17580094933509827, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0026, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.18856695294380188, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0045, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.17185454070568085, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0039, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.1997966468334198, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0043, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.14173944294452667, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0033, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.20653635263442993, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0039, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.19571708142757416, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0026, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.0877508670091629, + "learning_rate": 5.438496901657042e-06, + "loss": 0.005, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.17305001616477966, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0038, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.16555450856685638, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0035, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.15395715832710266, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2430422455072403, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0032, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.2465265393257141, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0034, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.08382703363895416, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0038, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3427184224128723, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0042, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.13029031455516815, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.11826448887586594, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0035, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.1612391620874405, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0039, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21143540740013123, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0057, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.22977286577224731, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.005, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.11853202432394028, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0058, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.24277184903621674, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0038, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.2625603675842285, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0048, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.1333419382572174, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0033, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.09627685695886612, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.416618674993515, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0038, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.18699553608894348, + "learning_rate": 5.294041118587667e-06, + "loss": 0.004, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.1827329397201538, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0039, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.19719162583351135, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0034, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.09895205497741699, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0042, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.11187861114740372, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0036, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.154103085398674, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.11124159395694733, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.27686378359794617, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0041, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.12900429964065552, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.26441213488578796, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0032, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.2187345325946808, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.004, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.08503159135580063, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0034, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.12869144976139069, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0035, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.13212713599205017, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0027, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.23211228847503662, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0032, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.2017366737127304, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.21221789717674255, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0039, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.24497511982917786, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0046, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.15008985996246338, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0039, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.15641193091869354, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0039, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.2608455419540405, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0036, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.09808705747127533, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0038, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.18084567785263062, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0045, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.1957635134458542, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0036, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.1479685753583908, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0045, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.14854201674461365, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0035, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.14744973182678223, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0033, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7196730375289917, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0071, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.22570419311523438, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0056, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.16870586574077606, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0042, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.12610554695129395, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0051, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.11198554188013077, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0042, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.13166265189647675, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0037, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.1181526631116867, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0037, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.2055635005235672, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0027, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.13400030136108398, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0028, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.09746947884559631, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0048, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.22124870121479034, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0048, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.09961193799972534, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0038, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.20024695992469788, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0036, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.3697144687175751, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0044, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.1713833063840866, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0038, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.1914745569229126, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0051, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.190393328666687, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0035, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.17361588776111603, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0038, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.19456325471401215, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0036, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.15772588551044464, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0039, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.11680205166339874, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0045, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.3643893599510193, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0044, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.1628265231847763, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0036, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.10073156654834747, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0041, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.13039462268352509, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0045, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.12775596976280212, + "learning_rate": 4.911226880894818e-06, + "loss": 0.003, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.1513100564479828, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0044, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.1346164345741272, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0036, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.12880294024944305, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0041, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.3154917359352112, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0038, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.18458192050457, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0057, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.2524041533470154, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0038, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.11894001811742783, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0033, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.1094699576497078, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0037, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.11090611666440964, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0045, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.3179106116294861, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0041, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.09424899518489838, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0034, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.3028348982334137, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0035, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.30831560492515564, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0045, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34811046719551086, + "learning_rate": 4.81141273556404e-06, + "loss": 0.005, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.18413113057613373, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0044, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.11229179799556732, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.004, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.2966957688331604, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0056, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.10525348782539368, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0037, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.1479673534631729, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0049, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5229315757751465, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0051, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.17021632194519043, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0038, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.10177282989025116, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0036, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.17768025398254395, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0034, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.2090948224067688, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0027, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.1722206026315689, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0031, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.09709088504314423, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0037, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.1969165802001953, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0034, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.0810595229268074, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0038, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.22003750503063202, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0041, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.2809178829193115, + "learning_rate": 4.699083753549858e-06, + "loss": 0.003, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.1343737691640854, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0026, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.19191010296344757, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0035, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.16617201268672943, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0031, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.24936997890472412, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0032, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5643696188926697, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0053, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.19725625216960907, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0031, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.1692969799041748, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0035, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.17487913370132446, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.25642889738082886, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0043, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3692823350429535, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0039, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.230118989944458, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0047, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.1609203815460205, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0038, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.21201254427433014, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0029, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.10142157226800919, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0034, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.19121089577674866, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0033, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.156619131565094, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0045, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.14690659940242767, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0042, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.13466109335422516, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0041, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.3713383674621582, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0056, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.12184764444828033, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0038, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.23971956968307495, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0035, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.3320925235748291, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0052, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.11913793534040451, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0029, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.11725693941116333, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0044, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.1550632119178772, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0049, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.23413509130477905, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0043, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.16070885956287384, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0032, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.12317437678575516, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.3462170660495758, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0048, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.12654773890972137, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0045, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.06262557208538055, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0026, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.1439850926399231, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0045, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.24463413655757904, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0042, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.22048236429691315, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 0.10628963261842728, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0037, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.14685721695423126, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0043, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.18807503581047058, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0042, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.19162075221538544, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0047, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.2444164752960205, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0041, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.12120077759027481, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0037, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.19946682453155518, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.003, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.23982395231723785, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0035, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.13806626200675964, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0031, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2610985040664673, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0047, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.1384919434785843, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0055, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.14737965166568756, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0037, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.1304326057434082, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0036, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.22288398444652557, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0035, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.11266916245222092, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0071, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.15941838920116425, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0024, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.18921831250190735, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0026, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.10112889111042023, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0037, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.1865631341934204, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0028, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.20046782493591309, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0042, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.11953336745500565, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0026, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.17050383985042572, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0029, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.28782936930656433, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0042, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.2104359269142151, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0034, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.12790441513061523, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0048, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.12111827731132507, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0043, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.2542783319950104, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0043, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.17177502810955048, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0036, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.14121277630329132, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0051, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.11357807368040085, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.3277477025985718, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.37000587582588196, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0037, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.11122190207242966, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0034, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.14530375599861145, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0039, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.19974422454833984, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0033, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.15466761589050293, + "learning_rate": 4.230335566422999e-06, + "loss": 0.003, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.19129224121570587, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0042, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.2474614828824997, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0046, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.15569351613521576, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0048, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.09572251886129379, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.13737086951732635, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0043, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.12266672402620316, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0039, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.09208404272794724, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0039, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.16571840643882751, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0047, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.3071173131465912, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.09059276431798935, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0031, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.16070133447647095, + "learning_rate": 4.160146936563338e-06, + "loss": 0.004, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.12942227721214294, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0027, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.13913804292678833, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0048, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.206321582198143, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.20973987877368927, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0041, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.23191478848457336, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0063, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.18233250081539154, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0029, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.133034810423851, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0043, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.10777711123228073, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0028, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.14128559827804565, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0034, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.13215866684913635, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0036, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.18918493390083313, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0043, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.14459657669067383, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0045, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.17287056148052216, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0039, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.13909804821014404, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0039, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.14798089861869812, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0037, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.10916659235954285, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0023, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.1151762530207634, + "learning_rate": 4.053587511509546e-06, + "loss": 0.005, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.14232765138149261, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0032, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.09513483196496964, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0043, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.09156285226345062, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0039, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.1405397206544876, + "learning_rate": 4.028855757736123e-06, + "loss": 0.004, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.15840958058834076, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0037, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.190508171916008, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0037, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.15277954936027527, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0028, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 0.14111991226673126, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0034, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.31528833508491516, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0044, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.1420607715845108, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.1340852528810501, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0032, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.11166475713253021, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0033, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.13635945320129395, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0028, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.15865778923034668, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0036, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.08569981157779694, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0026, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.1041082963347435, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0033, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.17262709140777588, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0041, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.20455610752105713, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0035, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.15869568288326263, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0027, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.14855770766735077, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0042, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.08842955529689789, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0028, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.18251122534275055, + "learning_rate": 3.919189353330104e-06, + "loss": 0.003, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.24990014731884003, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0038, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.1088186502456665, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0036, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.09780745953321457, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0042, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.1625395119190216, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0033, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.16848890483379364, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0033, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.19756828248500824, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0051, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.15720513463020325, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0033, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.22365699708461761, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0028, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.07928138971328735, + "learning_rate": 3.865363184624925e-06, + "loss": 0.003, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.26314112544059753, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0049, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.1249697357416153, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0036, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 0.09758924692869186, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0031, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.08506497740745544, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0037, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.1978219896554947, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0036, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 0.15215060114860535, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.1608658879995346, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0034, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.10854586958885193, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0026, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.1394745409488678, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0036, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.0879194363951683, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0034, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.11169253289699554, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0037, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.12410115450620651, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0033, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.13719962537288666, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0032, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.10031221807003021, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0046, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.1156797707080841, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0042, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.1494375318288803, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0036, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.08620154112577438, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0034, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.16659799218177795, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0053, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.1313968300819397, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0046, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.21495603024959564, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0037, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.11284582316875458, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0033, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.18478819727897644, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0038, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.12338980287313461, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0029, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.09782207757234573, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0037, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.10959567129611969, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0029, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.17048455774784088, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0035, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.12739142775535583, + "learning_rate": 3.707974016467e-06, + "loss": 0.0028, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.19227802753448486, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0045, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.11818226426839828, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0032, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.10820474475622177, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0026, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.11386270821094513, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0026, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.23488907516002655, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0035, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.12526266276836395, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0037, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.22899770736694336, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0035, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.13044586777687073, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0039, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.3652730882167816, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0041, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.1416187435388565, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.11176013946533203, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0037, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.09744516015052795, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0037, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.11925745010375977, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.0942603051662445, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0031, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.12849931418895721, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0042, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.11910247802734375, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0025, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.09603044390678406, + "learning_rate": 3.612069140022124e-06, + "loss": 0.004, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.1962766945362091, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0038, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.15775476396083832, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0043, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.1549777239561081, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0041, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.24444808065891266, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0029, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.12734061479568481, + "learning_rate": 3.584337233394337e-06, + "loss": 0.003, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.23149384558200836, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0042, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.1598765254020691, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0041, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.12173855304718018, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0046, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.09653043001890182, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0032, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.13262024521827698, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.003, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.2603001892566681, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0031, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.24721759557724, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0028, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.11963216960430145, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0047, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.12025906145572662, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0024, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.1969287395477295, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0033, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.24025285243988037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0047, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.07612641155719757, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0026, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.18313643336296082, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0038, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.3311282694339752, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0036, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.16643930971622467, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0034, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.11099164932966232, + "learning_rate": 3.497061149826966e-06, + "loss": 0.003, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.11017951369285583, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0031, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.17948199808597565, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0037, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.1002451479434967, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0042, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.13393986225128174, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0038, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.0963628888130188, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0027, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.14946860074996948, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0039, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.2011580467224121, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0045, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.12523533403873444, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0033, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.22948165237903595, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0031, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.24120132625102997, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0046, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.30398526787757874, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0043, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.13554388284683228, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0033, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.14989149570465088, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.003, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.15678660571575165, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0037, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.29919424653053284, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0042, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.08935242891311646, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.004, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.22928708791732788, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0042, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.18873436748981476, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0036, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.0956149622797966, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0037, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.13334470987319946, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0034, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.13492803275585175, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0033, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.13227517902851105, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.11342936754226685, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0042, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.3178110122680664, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0038, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.04432455077767372, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0028, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.09680923074483871, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0036, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.2477794885635376, + "learning_rate": 3.354907302553392e-06, + "loss": 0.004, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.11931425333023071, + "learning_rate": 3.349767211300933e-06, + "loss": 0.004, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.1410735696554184, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0032, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.16996408998966217, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0041, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.1275407373905182, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0038, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.10107860714197159, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0036, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.10196204483509064, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0032, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.10152500867843628, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.19691230356693268, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0045, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.33672890067100525, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0038, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.09857437759637833, + "learning_rate": 3.303911119253872e-06, + "loss": 0.004, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.13289818167686462, + "learning_rate": 3.298861077451818e-06, + "loss": 0.003, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.18509522080421448, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0064, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.11460676789283752, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0024, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.12012742459774017, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0039, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.356365442276001, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0035, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.5451288223266602, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0068, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.1067429855465889, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0034, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.2349347621202469, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0041, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.09102735668420792, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0032, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.11968998610973358, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0034, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.1355520486831665, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0034, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.11785157769918442, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0044, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.12043727189302444, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0047, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.13475126028060913, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0033, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.12776954472064972, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0032, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.10374128818511963, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0054, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.08750293403863907, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0035, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.1284732222557068, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0047, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.12900014221668243, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0042, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.11983122676610947, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0031, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.20311471819877625, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0045, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.1965232491493225, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0047, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.10592305660247803, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0031, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.10558371245861053, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0032, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.12083200365304947, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0025, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.2367735505104065, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0024, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.1387612670660019, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.004, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.18766231834888458, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0035, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.18110574781894684, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0031, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.1886875331401825, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.003, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.09323479980230331, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.004, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.1508265882730484, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0039, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.11250200122594833, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0039, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.23230062425136566, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.179047629237175, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.004, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.13797952234745026, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0042, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.12740616500377655, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0027, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.11396504938602448, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0037, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.12815812230110168, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0033, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.17100073397159576, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0039, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.09657446295022964, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0031, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.3235829472541809, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0049, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.17849496006965637, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0033, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.16907230019569397, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0028, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.26099368929862976, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.21024562418460846, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.10564325749874115, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0032, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.10607697814702988, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0037, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.20698976516723633, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.20934849977493286, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0033, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.12407243996858597, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0037, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.13003374636173248, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0032, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.15529648959636688, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0048, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.12824782729148865, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0031, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.12616124749183655, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0037, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.2119731307029724, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0037, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.22325192391872406, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0035, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.10937803238630295, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0037, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.3106321692466736, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2864716649055481, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0037, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.10637935250997543, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0037, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.11078158766031265, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0034, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.06270865350961685, + "learning_rate": 3.003637700546652e-06, + "loss": 0.003, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.12176132947206497, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0043, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.16978275775909424, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0032, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.2582871913909912, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0036, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.27402547001838684, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0031, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.15350353717803955, + "learning_rate": 2.981383959667165e-06, + "loss": 0.004, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.0939447432756424, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0033, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.16549192368984222, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0048, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.11002931743860245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0028, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.17383548617362976, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0032, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.18648599088191986, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0039, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.2366044819355011, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0036, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.1678195595741272, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0032, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.31918013095855713, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0045, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.14635732769966125, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0039, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.19166909158229828, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0028, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.11960610002279282, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0029, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.06636705994606018, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0025, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.17033624649047852, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0028, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.07974246889352798, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.003, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.1188567653298378, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0043, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.11378541588783264, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0028, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.11495907604694366, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0037, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.144247367978096, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0031, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.14722205698490143, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0046, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.10647077113389969, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0026, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.17438668012619019, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0032, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.17071637511253357, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0035, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.2201206386089325, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0041, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.14397655427455902, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0037, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.055822595953941345, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0035, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.13084810972213745, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0046, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.3321281373500824, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0038, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.1274777501821518, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0041, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.09797787666320801, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0034, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.1270579695701599, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0042, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.09015227854251862, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0034, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.12557077407836914, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0037, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.2725144922733307, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0036, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.13758502900600433, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0039, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.19999243319034576, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0038, + "step": 26000 + }, + { + "epoch": 1.5585115944634191, + "grad_norm": 0.1323961615562439, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0033, + "step": 26010 + }, + { + "epoch": 1.5591107915393372, + "grad_norm": 0.12714031338691711, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.0033, + "step": 26020 + }, + { + "epoch": 1.5597099886152557, + "grad_norm": 0.40822476148605347, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0041, + "step": 26030 + }, + { + "epoch": 1.5603091856911737, + "grad_norm": 0.14638100564479828, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0042, + "step": 26040 + }, + { + "epoch": 1.5609083827670922, + "grad_norm": 0.17443427443504333, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.0031, + "step": 26050 + }, + { + "epoch": 1.5615075798430103, + "grad_norm": 0.09581520408391953, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0036, + "step": 26060 + }, + { + "epoch": 1.5621067769189287, + "grad_norm": 0.14804130792617798, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.004, + "step": 26070 + }, + { + "epoch": 1.5627059739948468, + "grad_norm": 0.4015085697174072, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0062, + "step": 26080 + }, + { + "epoch": 1.5633051710707653, + "grad_norm": 0.3468920886516571, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.004, + "step": 26090 + }, + { + "epoch": 1.5639043681466833, + "grad_norm": 0.19594644010066986, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0042, + "step": 26100 + }, + { + "epoch": 1.5645035652226018, + "grad_norm": 0.09097496420145035, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0027, + "step": 26110 + }, + { + "epoch": 1.5651027622985199, + "grad_norm": 0.11387573927640915, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0025, + "step": 26120 + }, + { + "epoch": 1.5657019593744383, + "grad_norm": 0.17657096683979034, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0036, + "step": 26130 + }, + { + "epoch": 1.5663011564503564, + "grad_norm": 0.09257909655570984, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0033, + "step": 26140 + }, + { + "epoch": 1.5669003535262749, + "grad_norm": 0.15154404938220978, + "learning_rate": 2.771889969647e-06, + "loss": 0.0046, + "step": 26150 + }, + { + "epoch": 1.567499550602193, + "grad_norm": 0.07300597429275513, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0028, + "step": 26160 + }, + { + "epoch": 1.5680987476781114, + "grad_norm": 0.12779368460178375, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0028, + "step": 26170 + }, + { + "epoch": 1.5686979447540295, + "grad_norm": 0.12631577253341675, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0035, + "step": 26180 + }, + { + "epoch": 1.569297141829948, + "grad_norm": 0.3630695044994354, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0034, + "step": 26190 + }, + { + "epoch": 1.569896338905866, + "grad_norm": 0.18113726377487183, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.005, + "step": 26200 + }, + { + "epoch": 1.5704955359817845, + "grad_norm": 0.21797926723957062, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0033, + "step": 26210 + }, + { + "epoch": 1.5710947330577025, + "grad_norm": 0.1614106148481369, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0034, + "step": 26220 + }, + { + "epoch": 1.571693930133621, + "grad_norm": 0.10198274999856949, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0038, + "step": 26230 + }, + { + "epoch": 1.572293127209539, + "grad_norm": 0.14413216710090637, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0022, + "step": 26240 + }, + { + "epoch": 1.5728923242854576, + "grad_norm": 0.08031613379716873, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0037, + "step": 26250 + }, + { + "epoch": 1.5734915213613756, + "grad_norm": 0.31797754764556885, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0035, + "step": 26260 + }, + { + "epoch": 1.574090718437294, + "grad_norm": 0.0591890886425972, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0027, + "step": 26270 + }, + { + "epoch": 1.5746899155132121, + "grad_norm": 0.15585894882678986, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0064, + "step": 26280 + }, + { + "epoch": 1.5752891125891306, + "grad_norm": 0.13518628478050232, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0042, + "step": 26290 + }, + { + "epoch": 1.5758883096650487, + "grad_norm": 0.13154275715351105, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0033, + "step": 26300 + }, + { + "epoch": 1.5764875067409672, + "grad_norm": 0.33374130725860596, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0061, + "step": 26310 + }, + { + "epoch": 1.5770867038168854, + "grad_norm": 0.12396867573261261, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.004, + "step": 26320 + }, + { + "epoch": 1.5776859008928037, + "grad_norm": 0.08533058315515518, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0031, + "step": 26330 + }, + { + "epoch": 1.578285097968722, + "grad_norm": 0.25102120637893677, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.003, + "step": 26340 + }, + { + "epoch": 1.5788842950446402, + "grad_norm": 0.10319694876670837, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0035, + "step": 26350 + }, + { + "epoch": 1.5794834921205585, + "grad_norm": 0.1508130133152008, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.0046, + "step": 26360 + }, + { + "epoch": 1.5800826891964768, + "grad_norm": 0.09007565677165985, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0025, + "step": 26370 + }, + { + "epoch": 1.580681886272395, + "grad_norm": 0.13807767629623413, + "learning_rate": 2.683592557860616e-06, + "loss": 0.003, + "step": 26380 + }, + { + "epoch": 1.5812810833483133, + "grad_norm": 0.1909133791923523, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.0034, + "step": 26390 + }, + { + "epoch": 1.5818802804242316, + "grad_norm": 0.14300945401191711, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.004, + "step": 26400 + }, + { + "epoch": 1.5824794775001498, + "grad_norm": 0.08184076100587845, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0025, + "step": 26410 + }, + { + "epoch": 1.583078674576068, + "grad_norm": 0.1493527740240097, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0026, + "step": 26420 + }, + { + "epoch": 1.5836778716519864, + "grad_norm": 0.09850698709487915, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0032, + "step": 26430 + }, + { + "epoch": 1.5842770687279046, + "grad_norm": 0.0875677615404129, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0039, + "step": 26440 + }, + { + "epoch": 1.584876265803823, + "grad_norm": 0.2319948524236679, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0039, + "step": 26450 + }, + { + "epoch": 1.5854754628797412, + "grad_norm": 0.10797403752803802, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0036, + "step": 26460 + }, + { + "epoch": 1.5860746599556594, + "grad_norm": 0.19400249421596527, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0034, + "step": 26470 + }, + { + "epoch": 1.5866738570315777, + "grad_norm": 0.1569194793701172, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0039, + "step": 26480 + }, + { + "epoch": 1.587273054107496, + "grad_norm": 0.17117120325565338, + "learning_rate": 2.643185095075473e-06, + "loss": 0.003, + "step": 26490 + }, + { + "epoch": 1.5878722511834142, + "grad_norm": 0.19703997671604156, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0041, + "step": 26500 + }, + { + "epoch": 1.5884714482593325, + "grad_norm": 0.09663215279579163, + "learning_rate": 2.635965610168249e-06, + "loss": 0.005, + "step": 26510 + }, + { + "epoch": 1.5890706453352508, + "grad_norm": 0.13411357998847961, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0035, + "step": 26520 + }, + { + "epoch": 1.589669842411169, + "grad_norm": 0.15013787150382996, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0031, + "step": 26530 + }, + { + "epoch": 1.5902690394870873, + "grad_norm": 0.15517787635326385, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0028, + "step": 26540 + }, + { + "epoch": 1.5908682365630056, + "grad_norm": 0.23037715256214142, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0034, + "step": 26550 + }, + { + "epoch": 1.5914674336389238, + "grad_norm": 0.1925845891237259, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0028, + "step": 26560 + }, + { + "epoch": 1.5920666307148421, + "grad_norm": 0.08933448791503906, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0029, + "step": 26570 + }, + { + "epoch": 1.5926658277907604, + "grad_norm": 0.14989611506462097, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0034, + "step": 26580 + }, + { + "epoch": 1.5932650248666786, + "grad_norm": 0.2904585897922516, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0027, + "step": 26590 + }, + { + "epoch": 1.593864221942597, + "grad_norm": 0.17784662544727325, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0039, + "step": 26600 + }, + { + "epoch": 1.5944634190185152, + "grad_norm": 0.07810595631599426, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0025, + "step": 26610 + }, + { + "epoch": 1.5950626160944334, + "grad_norm": 0.06783948838710785, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.003, + "step": 26620 + }, + { + "epoch": 1.5956618131703517, + "grad_norm": 0.13763132691383362, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0037, + "step": 26630 + }, + { + "epoch": 1.59626101024627, + "grad_norm": 0.1127597987651825, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0045, + "step": 26640 + }, + { + "epoch": 1.5968602073221883, + "grad_norm": 0.07828421145677567, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0027, + "step": 26650 + }, + { + "epoch": 1.5974594043981065, + "grad_norm": 0.1327218860387802, + "learning_rate": 2.583073279935805e-06, + "loss": 0.0042, + "step": 26660 + }, + { + "epoch": 1.5980586014740248, + "grad_norm": 0.09427100419998169, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0027, + "step": 26670 + }, + { + "epoch": 1.598657798549943, + "grad_norm": 0.2112533301115036, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0046, + "step": 26680 + }, + { + "epoch": 1.5992569956258613, + "grad_norm": 0.24039748311042786, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0032, + "step": 26690 + }, + { + "epoch": 1.5998561927017796, + "grad_norm": 0.28341665863990784, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.005, + "step": 26700 + }, + { + "epoch": 1.6004553897776979, + "grad_norm": 0.23401512205600739, + "learning_rate": 2.565935706183804e-06, + "loss": 0.0029, + "step": 26710 + }, + { + "epoch": 1.6010545868536161, + "grad_norm": 0.13487978279590607, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0028, + "step": 26720 + }, + { + "epoch": 1.6016537839295344, + "grad_norm": 0.10604815185070038, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0029, + "step": 26730 + }, + { + "epoch": 1.6022529810054527, + "grad_norm": 0.12193044275045395, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0031, + "step": 26740 + }, + { + "epoch": 1.602852178081371, + "grad_norm": 0.291572630405426, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0033, + "step": 26750 + }, + { + "epoch": 1.6034513751572892, + "grad_norm": 0.14938616752624512, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0027, + "step": 26760 + }, + { + "epoch": 1.6040505722332075, + "grad_norm": 0.16085144877433777, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0022, + "step": 26770 + }, + { + "epoch": 1.6046497693091257, + "grad_norm": 0.14876601099967957, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0034, + "step": 26780 + }, + { + "epoch": 1.605248966385044, + "grad_norm": 0.13766804337501526, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0029, + "step": 26790 + }, + { + "epoch": 1.6058481634609623, + "grad_norm": 0.1824955940246582, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0028, + "step": 26800 + }, + { + "epoch": 1.6064473605368805, + "grad_norm": 0.09187015891075134, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0028, + "step": 26810 + }, + { + "epoch": 1.6070465576127988, + "grad_norm": 0.1488831490278244, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0023, + "step": 26820 + }, + { + "epoch": 1.607645754688717, + "grad_norm": 0.16146720945835114, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0036, + "step": 26830 + }, + { + "epoch": 1.6082449517646353, + "grad_norm": 0.19863533973693848, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0034, + "step": 26840 + }, + { + "epoch": 1.6088441488405536, + "grad_norm": 0.08710742741823196, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0028, + "step": 26850 + }, + { + "epoch": 1.609443345916472, + "grad_norm": 0.1280236840248108, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0037, + "step": 26860 + }, + { + "epoch": 1.6100425429923901, + "grad_norm": 0.29420942068099976, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0031, + "step": 26870 + }, + { + "epoch": 1.6106417400683086, + "grad_norm": 0.16633544862270355, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0037, + "step": 26880 + }, + { + "epoch": 1.6112409371442267, + "grad_norm": 0.10398953408002853, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0032, + "step": 26890 + }, + { + "epoch": 1.6118401342201452, + "grad_norm": 0.1609172523021698, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0031, + "step": 26900 + }, + { + "epoch": 1.6124393312960632, + "grad_norm": 0.14156407117843628, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0032, + "step": 26910 + }, + { + "epoch": 1.6130385283719817, + "grad_norm": 0.3801378309726715, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0039, + "step": 26920 + }, + { + "epoch": 1.6136377254478997, + "grad_norm": 0.1612473726272583, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0026, + "step": 26930 + }, + { + "epoch": 1.6142369225238182, + "grad_norm": 0.3169429898262024, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0038, + "step": 26940 + }, + { + "epoch": 1.6148361195997363, + "grad_norm": 0.11678534001111984, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0041, + "step": 26950 + }, + { + "epoch": 1.6154353166756548, + "grad_norm": 0.08701438456773758, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0025, + "step": 26960 + }, + { + "epoch": 1.6160345137515728, + "grad_norm": 0.14214813709259033, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0032, + "step": 26970 + }, + { + "epoch": 1.6166337108274913, + "grad_norm": 0.06335555016994476, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0022, + "step": 26980 + }, + { + "epoch": 1.6172329079034093, + "grad_norm": 0.1225769966840744, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0035, + "step": 26990 + }, + { + "epoch": 1.6178321049793278, + "grad_norm": 0.12757551670074463, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0034, + "step": 27000 + }, + { + "epoch": 1.6184313020552459, + "grad_norm": 0.04847760871052742, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0025, + "step": 27010 + }, + { + "epoch": 1.6190304991311644, + "grad_norm": 0.11208045482635498, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0028, + "step": 27020 + }, + { + "epoch": 1.6196296962070824, + "grad_norm": 0.10029870271682739, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0042, + "step": 27030 + }, + { + "epoch": 1.620228893283001, + "grad_norm": 0.10894428193569183, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.003, + "step": 27040 + }, + { + "epoch": 1.620828090358919, + "grad_norm": 0.16484397649765015, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0027, + "step": 27050 + }, + { + "epoch": 1.6214272874348374, + "grad_norm": 0.18669992685317993, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0031, + "step": 27060 + }, + { + "epoch": 1.6220264845107555, + "grad_norm": 0.10345451533794403, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0024, + "step": 27070 + }, + { + "epoch": 1.622625681586674, + "grad_norm": 0.14037790894508362, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0027, + "step": 27080 + }, + { + "epoch": 1.623224878662592, + "grad_norm": 0.2581053078174591, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0027, + "step": 27090 + }, + { + "epoch": 1.6238240757385105, + "grad_norm": 0.12379001826047897, + "learning_rate": 2.440792688039862e-06, + "loss": 0.0024, + "step": 27100 + }, + { + "epoch": 1.6244232728144286, + "grad_norm": 0.17116566002368927, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0025, + "step": 27110 + }, + { + "epoch": 1.625022469890347, + "grad_norm": 0.13846145570278168, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0042, + "step": 27120 + }, + { + "epoch": 1.625621666966265, + "grad_norm": 0.09063230454921722, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0032, + "step": 27130 + }, + { + "epoch": 1.6262208640421836, + "grad_norm": 0.19914232194423676, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0043, + "step": 27140 + }, + { + "epoch": 1.6268200611181016, + "grad_norm": 0.13414347171783447, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0031, + "step": 27150 + }, + { + "epoch": 1.6274192581940201, + "grad_norm": 0.11173701286315918, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0037, + "step": 27160 + }, + { + "epoch": 1.6280184552699382, + "grad_norm": 0.11112072318792343, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0032, + "step": 27170 + }, + { + "epoch": 1.6286176523458566, + "grad_norm": 0.27570319175720215, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0048, + "step": 27180 + }, + { + "epoch": 1.6292168494217747, + "grad_norm": 0.09076099097728729, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0033, + "step": 27190 + }, + { + "epoch": 1.6298160464976932, + "grad_norm": 0.185089111328125, + "learning_rate": 2.411157015949963e-06, + "loss": 0.005, + "step": 27200 + }, + { + "epoch": 1.6304152435736112, + "grad_norm": 0.06751414388418198, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0026, + "step": 27210 + }, + { + "epoch": 1.6310144406495297, + "grad_norm": 0.14673012495040894, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0029, + "step": 27220 + }, + { + "epoch": 1.6316136377254478, + "grad_norm": 0.11741532385349274, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0034, + "step": 27230 + }, + { + "epoch": 1.6322128348013663, + "grad_norm": 0.06512618809938431, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0032, + "step": 27240 + }, + { + "epoch": 1.6328120318772843, + "grad_norm": 0.22004343569278717, + "learning_rate": 2.396716944205467e-06, + "loss": 0.004, + "step": 27250 + }, + { + "epoch": 1.6334112289532028, + "grad_norm": 0.1706841140985489, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0028, + "step": 27260 + }, + { + "epoch": 1.6340104260291208, + "grad_norm": 0.1023155003786087, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0029, + "step": 27270 + }, + { + "epoch": 1.6346096231050393, + "grad_norm": 0.17524677515029907, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0039, + "step": 27280 + }, + { + "epoch": 1.6352088201809574, + "grad_norm": 0.10368278622627258, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0026, + "step": 27290 + }, + { + "epoch": 1.6358080172568759, + "grad_norm": 0.06621989607810974, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0028, + "step": 27300 + }, + { + "epoch": 1.636407214332794, + "grad_norm": 0.2700876295566559, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0045, + "step": 27310 + }, + { + "epoch": 1.6370064114087124, + "grad_norm": 0.07727917283773422, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0034, + "step": 27320 + }, + { + "epoch": 1.6376056084846304, + "grad_norm": 0.11636139452457428, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0026, + "step": 27330 + }, + { + "epoch": 1.638204805560549, + "grad_norm": 0.07539201527833939, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0029, + "step": 27340 + }, + { + "epoch": 1.638804002636467, + "grad_norm": 0.14615486562252045, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0037, + "step": 27350 + }, + { + "epoch": 1.6394031997123855, + "grad_norm": 0.10396217554807663, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.004, + "step": 27360 + }, + { + "epoch": 1.6400023967883035, + "grad_norm": 0.08993582427501678, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0037, + "step": 27370 + }, + { + "epoch": 1.640601593864222, + "grad_norm": 0.15601681172847748, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0032, + "step": 27380 + }, + { + "epoch": 1.6412007909401403, + "grad_norm": 0.27940425276756287, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0035, + "step": 27390 + }, + { + "epoch": 1.6417999880160585, + "grad_norm": 0.19063127040863037, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0032, + "step": 27400 + }, + { + "epoch": 1.6423991850919768, + "grad_norm": 0.0989932119846344, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0033, + "step": 27410 + }, + { + "epoch": 1.642998382167895, + "grad_norm": 0.1885364055633545, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0046, + "step": 27420 + }, + { + "epoch": 1.6435975792438133, + "grad_norm": 0.19619473814964294, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0032, + "step": 27430 + }, + { + "epoch": 1.6441967763197316, + "grad_norm": 0.3142991364002228, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0041, + "step": 27440 + }, + { + "epoch": 1.6447959733956499, + "grad_norm": 0.09447146952152252, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0034, + "step": 27450 + }, + { + "epoch": 1.6453951704715681, + "grad_norm": 0.12683053314685822, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0044, + "step": 27460 + }, + { + "epoch": 1.6459943675474864, + "grad_norm": 0.30415666103363037, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0048, + "step": 27470 + }, + { + "epoch": 1.6465935646234047, + "grad_norm": 0.16130568087100983, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0032, + "step": 27480 + }, + { + "epoch": 1.647192761699323, + "grad_norm": 0.19884297251701355, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0036, + "step": 27490 + }, + { + "epoch": 1.6477919587752412, + "grad_norm": 0.2124500721693039, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0048, + "step": 27500 + }, + { + "epoch": 1.6483911558511595, + "grad_norm": 0.20656649768352509, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0044, + "step": 27510 + }, + { + "epoch": 1.6489903529270777, + "grad_norm": 0.08909416943788528, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0042, + "step": 27520 + }, + { + "epoch": 1.649589550002996, + "grad_norm": 0.1665533483028412, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0045, + "step": 27530 + }, + { + "epoch": 1.6501887470789143, + "grad_norm": 0.11362092941999435, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0028, + "step": 27540 + }, + { + "epoch": 1.6507879441548325, + "grad_norm": 0.11079458892345428, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0044, + "step": 27550 + }, + { + "epoch": 1.6513871412307508, + "grad_norm": 0.1600227653980255, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0032, + "step": 27560 + }, + { + "epoch": 1.651986338306669, + "grad_norm": 0.10425245016813278, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0024, + "step": 27570 + }, + { + "epoch": 1.6525855353825873, + "grad_norm": 0.142449289560318, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0031, + "step": 27580 + }, + { + "epoch": 1.6531847324585056, + "grad_norm": 0.13777248561382294, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0031, + "step": 27590 + }, + { + "epoch": 1.6537839295344239, + "grad_norm": 0.21916678547859192, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0034, + "step": 27600 + }, + { + "epoch": 1.6543831266103421, + "grad_norm": 0.11044235527515411, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0032, + "step": 27610 + }, + { + "epoch": 1.6549823236862604, + "grad_norm": 0.30877354741096497, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0039, + "step": 27620 + }, + { + "epoch": 1.6555815207621787, + "grad_norm": 0.12299321591854095, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.003, + "step": 27630 + }, + { + "epoch": 1.656180717838097, + "grad_norm": 0.10495458543300629, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0036, + "step": 27640 + }, + { + "epoch": 1.6567799149140152, + "grad_norm": 0.13938122987747192, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0045, + "step": 27650 + }, + { + "epoch": 1.6573791119899335, + "grad_norm": 0.1632867157459259, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0043, + "step": 27660 + }, + { + "epoch": 1.6579783090658518, + "grad_norm": 0.11505074799060822, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.004, + "step": 27670 + }, + { + "epoch": 1.65857750614177, + "grad_norm": 0.19847853481769562, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0033, + "step": 27680 + }, + { + "epoch": 1.6591767032176883, + "grad_norm": 0.0759914219379425, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0025, + "step": 27690 + }, + { + "epoch": 1.6597759002936066, + "grad_norm": 0.23778557777404785, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0029, + "step": 27700 + }, + { + "epoch": 1.6603750973695248, + "grad_norm": 0.14102019369602203, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0026, + "step": 27710 + }, + { + "epoch": 1.660974294445443, + "grad_norm": 0.09396950155496597, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0032, + "step": 27720 + }, + { + "epoch": 1.6615734915213614, + "grad_norm": 0.1578163504600525, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0034, + "step": 27730 + }, + { + "epoch": 1.6621726885972796, + "grad_norm": 0.12897615134716034, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0034, + "step": 27740 + }, + { + "epoch": 1.662771885673198, + "grad_norm": 0.05674497038125992, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0028, + "step": 27750 + }, + { + "epoch": 1.6633710827491162, + "grad_norm": 0.12161347270011902, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0033, + "step": 27760 + }, + { + "epoch": 1.6639702798250344, + "grad_norm": 0.11158734560012817, + "learning_rate": 2.261577490652103e-06, + "loss": 0.004, + "step": 27770 + }, + { + "epoch": 1.6645694769009527, + "grad_norm": 0.09899834543466568, + "learning_rate": 2.259249109209093e-06, + "loss": 0.003, + "step": 27780 + }, + { + "epoch": 1.665168673976871, + "grad_norm": 0.2654432952404022, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0032, + "step": 27790 + }, + { + "epoch": 1.6657678710527892, + "grad_norm": 0.1188909262418747, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0034, + "step": 27800 + }, + { + "epoch": 1.6663670681287075, + "grad_norm": 0.4437197148799896, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.005, + "step": 27810 + }, + { + "epoch": 1.6669662652046258, + "grad_norm": 0.17790400981903076, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0033, + "step": 27820 + }, + { + "epoch": 1.667565462280544, + "grad_norm": 0.10867536813020706, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0032, + "step": 27830 + }, + { + "epoch": 1.6681646593564623, + "grad_norm": 0.10958084464073181, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0024, + "step": 27840 + }, + { + "epoch": 1.6687638564323806, + "grad_norm": 0.06520948559045792, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0029, + "step": 27850 + }, + { + "epoch": 1.6693630535082988, + "grad_norm": 0.13580842316150665, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0029, + "step": 27860 + }, + { + "epoch": 1.669962250584217, + "grad_norm": 0.15817365050315857, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.002, + "step": 27870 + }, + { + "epoch": 1.6705614476601354, + "grad_norm": 0.35285326838493347, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0062, + "step": 27880 + }, + { + "epoch": 1.6711606447360536, + "grad_norm": 0.24554285407066345, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0043, + "step": 27890 + }, + { + "epoch": 1.671759841811972, + "grad_norm": 0.16509993374347687, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0032, + "step": 27900 + }, + { + "epoch": 1.6723590388878904, + "grad_norm": 0.13468189537525177, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0026, + "step": 27910 + }, + { + "epoch": 1.6729582359638084, + "grad_norm": 0.17360062897205353, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0029, + "step": 27920 + }, + { + "epoch": 1.673557433039727, + "grad_norm": 0.12582021951675415, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0035, + "step": 27930 + }, + { + "epoch": 1.674156630115645, + "grad_norm": 0.1015002503991127, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0033, + "step": 27940 + }, + { + "epoch": 1.6747558271915635, + "grad_norm": 0.3634873926639557, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0031, + "step": 27950 + }, + { + "epoch": 1.6753550242674815, + "grad_norm": 0.15137465298175812, + "learning_rate": 2.219094909262834e-06, + "loss": 0.006, + "step": 27960 + }, + { + "epoch": 1.6759542213434, + "grad_norm": 0.09976715594530106, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0031, + "step": 27970 + }, + { + "epoch": 1.676553418419318, + "grad_norm": 0.09910957515239716, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0024, + "step": 27980 + }, + { + "epoch": 1.6771526154952365, + "grad_norm": 0.11276205629110336, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0032, + "step": 27990 + }, + { + "epoch": 1.6777518125711546, + "grad_norm": 0.22798313200473785, + "learning_rate": 2.210624641425579e-06, + "loss": 0.004, + "step": 28000 + }, + { + "epoch": 1.678351009647073, + "grad_norm": 0.10564117878675461, + "learning_rate": 2.208532855337684e-06, + "loss": 0.003, + "step": 28010 + }, + { + "epoch": 1.6789502067229911, + "grad_norm": 0.4329298138618469, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.0048, + "step": 28020 + }, + { + "epoch": 1.6795494037989096, + "grad_norm": 0.19210365414619446, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0034, + "step": 28030 + }, + { + "epoch": 1.6801486008748276, + "grad_norm": 0.15383297204971313, + "learning_rate": 2.202319410491029e-06, + "loss": 0.0032, + "step": 28040 + }, + { + "epoch": 1.6807477979507461, + "grad_norm": 0.0796160027384758, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.0029, + "step": 28050 + }, + { + "epoch": 1.6813469950266642, + "grad_norm": 0.13568224012851715, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0032, + "step": 28060 + }, + { + "epoch": 1.6819461921025827, + "grad_norm": 0.16137683391571045, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0034, + "step": 28070 + }, + { + "epoch": 1.6825453891785007, + "grad_norm": 0.07282284647226334, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0031, + "step": 28080 + }, + { + "epoch": 1.6831445862544192, + "grad_norm": 0.17293596267700195, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.0034, + "step": 28090 + }, + { + "epoch": 1.6837437833303373, + "grad_norm": 0.12304569780826569, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0032, + "step": 28100 + }, + { + "epoch": 1.6843429804062557, + "grad_norm": 0.08121246099472046, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0031, + "step": 28110 + }, + { + "epoch": 1.6849421774821738, + "grad_norm": 0.20509444177150726, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.003, + "step": 28120 + }, + { + "epoch": 1.6855413745580923, + "grad_norm": 0.13242113590240479, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0028, + "step": 28130 + }, + { + "epoch": 1.6861405716340103, + "grad_norm": 0.14173154532909393, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0028, + "step": 28140 + }, + { + "epoch": 1.6867397687099288, + "grad_norm": 0.40674927830696106, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0038, + "step": 28150 + }, + { + "epoch": 1.6873389657858469, + "grad_norm": 0.08623358607292175, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0044, + "step": 28160 + }, + { + "epoch": 1.6879381628617653, + "grad_norm": 0.15655292570590973, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0035, + "step": 28170 + }, + { + "epoch": 1.6885373599376834, + "grad_norm": 0.13297052681446075, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0036, + "step": 28180 + }, + { + "epoch": 1.6891365570136019, + "grad_norm": 0.16847316920757294, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0045, + "step": 28190 + }, + { + "epoch": 1.68973575408952, + "grad_norm": 0.24691548943519592, + "learning_rate": 2.17075192815944e-06, + "loss": 0.0036, + "step": 28200 + }, + { + "epoch": 1.6903349511654384, + "grad_norm": 0.16196060180664062, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0034, + "step": 28210 + }, + { + "epoch": 1.6909341482413565, + "grad_norm": 0.24568283557891846, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0046, + "step": 28220 + }, + { + "epoch": 1.691533345317275, + "grad_norm": 0.2796950042247772, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0028, + "step": 28230 + }, + { + "epoch": 1.692132542393193, + "grad_norm": 0.12654386460781097, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0028, + "step": 28240 + }, + { + "epoch": 1.6927317394691115, + "grad_norm": 0.26169532537460327, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0039, + "step": 28250 + }, + { + "epoch": 1.6933309365450295, + "grad_norm": 0.19274167716503143, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.0035, + "step": 28260 + }, + { + "epoch": 1.693930133620948, + "grad_norm": 0.2024545818567276, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0031, + "step": 28270 + }, + { + "epoch": 1.694529330696866, + "grad_norm": 0.12702754139900208, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0031, + "step": 28280 + }, + { + "epoch": 1.6951285277727846, + "grad_norm": 0.09141751378774643, + "learning_rate": 2.154160296808588e-06, + "loss": 0.0034, + "step": 28290 + }, + { + "epoch": 1.6957277248487026, + "grad_norm": 0.049686893820762634, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0023, + "step": 28300 + }, + { + "epoch": 1.696326921924621, + "grad_norm": 0.14534041285514832, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0031, + "step": 28310 + }, + { + "epoch": 1.6969261190005391, + "grad_norm": 0.08376278728246689, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0036, + "step": 28320 + }, + { + "epoch": 1.6975253160764576, + "grad_norm": 0.14242660999298096, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0027, + "step": 28330 + }, + { + "epoch": 1.6981245131523757, + "grad_norm": 0.13080888986587524, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0026, + "step": 28340 + }, + { + "epoch": 1.6987237102282942, + "grad_norm": 0.15067961812019348, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0027, + "step": 28350 + }, + { + "epoch": 1.6993229073042122, + "grad_norm": 0.06736161559820175, + "learning_rate": 2.141836952964938e-06, + "loss": 0.002, + "step": 28360 + }, + { + "epoch": 1.6999221043801307, + "grad_norm": 0.17074856162071228, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0033, + "step": 28370 + }, + { + "epoch": 1.7005213014560487, + "grad_norm": 0.12649405002593994, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0033, + "step": 28380 + }, + { + "epoch": 1.7011204985319672, + "grad_norm": 0.15945205092430115, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0034, + "step": 28390 + }, + { + "epoch": 1.7017196956078853, + "grad_norm": 0.09780653566122055, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0025, + "step": 28400 + }, + { + "epoch": 1.7023188926838038, + "grad_norm": 0.4059111773967743, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0036, + "step": 28410 + }, + { + "epoch": 1.7029180897597218, + "grad_norm": 0.17648592591285706, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0044, + "step": 28420 + }, + { + "epoch": 1.7035172868356403, + "grad_norm": 0.22077783942222595, + "learning_rate": 2.130022988569117e-06, + "loss": 0.0024, + "step": 28430 + }, + { + "epoch": 1.7041164839115586, + "grad_norm": 0.29329779744148254, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0033, + "step": 28440 + }, + { + "epoch": 1.7047156809874768, + "grad_norm": 0.11621195822954178, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0034, + "step": 28450 + }, + { + "epoch": 1.705314878063395, + "grad_norm": 0.15548695623874664, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0029, + "step": 28460 + }, + { + "epoch": 1.7059140751393134, + "grad_norm": 0.11100694537162781, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0028, + "step": 28470 + }, + { + "epoch": 1.7065132722152316, + "grad_norm": 0.226019024848938, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0039, + "step": 28480 + }, + { + "epoch": 1.70711246929115, + "grad_norm": 0.11407872289419174, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.0027, + "step": 28490 + }, + { + "epoch": 1.7077116663670682, + "grad_norm": 0.10995867848396301, + "learning_rate": 2.118719082970852e-06, + "loss": 0.003, + "step": 28500 + }, + { + "epoch": 1.7083108634429864, + "grad_norm": 0.2666904926300049, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0029, + "step": 28510 + }, + { + "epoch": 1.7089100605189047, + "grad_norm": 0.17604438960552216, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0032, + "step": 28520 + }, + { + "epoch": 1.709509257594823, + "grad_norm": 0.26522526144981384, + "learning_rate": 2.114030854571176e-06, + "loss": 0.0031, + "step": 28530 + }, + { + "epoch": 1.7101084546707412, + "grad_norm": 0.0793771743774414, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0029, + "step": 28540 + }, + { + "epoch": 1.7107076517466595, + "grad_norm": 0.15632960200309753, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.0039, + "step": 28550 + }, + { + "epoch": 1.7113068488225778, + "grad_norm": 0.14011208713054657, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0037, + "step": 28560 + }, + { + "epoch": 1.711906045898496, + "grad_norm": 0.12041427195072174, + "learning_rate": 2.107925886189472e-06, + "loss": 0.0043, + "step": 28570 + }, + { + "epoch": 1.7125052429744143, + "grad_norm": 0.1488238424062729, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.0032, + "step": 28580 + }, + { + "epoch": 1.7131044400503326, + "grad_norm": 0.21010251343250275, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0034, + "step": 28590 + }, + { + "epoch": 1.7137036371262508, + "grad_norm": 0.05566203221678734, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0027, + "step": 28600 + }, + { + "epoch": 1.7143028342021691, + "grad_norm": 0.21272292733192444, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.0044, + "step": 28610 + }, + { + "epoch": 1.7149020312780874, + "grad_norm": 0.12333092093467712, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0045, + "step": 28620 + }, + { + "epoch": 1.7155012283540056, + "grad_norm": 0.13811928033828735, + "learning_rate": 2.099081528450828e-06, + "loss": 0.0035, + "step": 28630 + }, + { + "epoch": 1.716100425429924, + "grad_norm": 0.07431097328662872, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0022, + "step": 28640 + }, + { + "epoch": 1.7166996225058422, + "grad_norm": 0.08534728735685349, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0017, + "step": 28650 + }, + { + "epoch": 1.7172988195817604, + "grad_norm": 0.3239804804325104, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0029, + "step": 28660 + }, + { + "epoch": 1.7178980166576787, + "grad_norm": 0.13592901825904846, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.0023, + "step": 28670 + }, + { + "epoch": 1.718497213733597, + "grad_norm": 0.19489605724811554, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0036, + "step": 28680 + }, + { + "epoch": 1.7190964108095153, + "grad_norm": 0.10355830937623978, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0028, + "step": 28690 + }, + { + "epoch": 1.7196956078854335, + "grad_norm": 0.09176923334598541, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0038, + "step": 28700 + }, + { + "epoch": 1.7202948049613518, + "grad_norm": 0.11903666704893112, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0031, + "step": 28710 + }, + { + "epoch": 1.72089400203727, + "grad_norm": 0.0953182652592659, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0035, + "step": 28720 + }, + { + "epoch": 1.7214931991131883, + "grad_norm": 0.16318652033805847, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0067, + "step": 28730 + }, + { + "epoch": 1.7220923961891066, + "grad_norm": 0.07236472517251968, + "learning_rate": 2.083843803000755e-06, + "loss": 0.0028, + "step": 28740 + }, + { + "epoch": 1.7226915932650249, + "grad_norm": 0.1972363442182541, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0034, + "step": 28750 + }, + { + "epoch": 1.7232907903409431, + "grad_norm": 0.09618771076202393, + "learning_rate": 2.081209272216522e-06, + "loss": 0.0041, + "step": 28760 + }, + { + "epoch": 1.7238899874168614, + "grad_norm": 0.10503746569156647, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0023, + "step": 28770 + }, + { + "epoch": 1.7244891844927797, + "grad_norm": 0.06267210096120834, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0037, + "step": 28780 + }, + { + "epoch": 1.725088381568698, + "grad_norm": 0.08882488310337067, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0033, + "step": 28790 + }, + { + "epoch": 1.7256875786446162, + "grad_norm": 0.0851057916879654, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0031, + "step": 28800 + }, + { + "epoch": 1.7262867757205345, + "grad_norm": 0.07359647005796432, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0023, + "step": 28810 + }, + { + "epoch": 1.7268859727964527, + "grad_norm": 0.10275846719741821, + "learning_rate": 2.073556929020357e-06, + "loss": 0.003, + "step": 28820 + }, + { + "epoch": 1.727485169872371, + "grad_norm": 0.20987747609615326, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0028, + "step": 28830 + }, + { + "epoch": 1.7280843669482893, + "grad_norm": 0.18235883116722107, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0023, + "step": 28840 + }, + { + "epoch": 1.7286835640242075, + "grad_norm": 0.33091968297958374, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.0036, + "step": 28850 + }, + { + "epoch": 1.7292827611001258, + "grad_norm": 0.17163614928722382, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0038, + "step": 28860 + }, + { + "epoch": 1.729881958176044, + "grad_norm": 0.1668156236410141, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0036, + "step": 28870 + }, + { + "epoch": 1.7304811552519623, + "grad_norm": 0.11935430020093918, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0038, + "step": 28880 + }, + { + "epoch": 1.7310803523278806, + "grad_norm": 0.12218618392944336, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0026, + "step": 28890 + }, + { + "epoch": 1.7316795494037989, + "grad_norm": 0.07735276222229004, + "learning_rate": 2.063940494310565e-06, + "loss": 0.0037, + "step": 28900 + }, + { + "epoch": 1.7322787464797171, + "grad_norm": 0.11424548178911209, + "learning_rate": 2.062785616754097e-06, + "loss": 0.0033, + "step": 28910 + }, + { + "epoch": 1.7328779435556354, + "grad_norm": 0.14824451506137848, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0033, + "step": 28920 + }, + { + "epoch": 1.7334771406315537, + "grad_norm": 0.1048273965716362, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0029, + "step": 28930 + }, + { + "epoch": 1.734076337707472, + "grad_norm": 0.29807451367378235, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0042, + "step": 28940 + }, + { + "epoch": 1.7346755347833902, + "grad_norm": 0.1315613090991974, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0027, + "step": 28950 + }, + { + "epoch": 1.7352747318593085, + "grad_norm": 0.1035790666937828, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0027, + "step": 28960 + }, + { + "epoch": 1.7358739289352267, + "grad_norm": 0.07466026395559311, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0034, + "step": 28970 + }, + { + "epoch": 1.7364731260111452, + "grad_norm": 0.11429832130670547, + "learning_rate": 2.054995221133923e-06, + "loss": 0.003, + "step": 28980 + }, + { + "epoch": 1.7370723230870633, + "grad_norm": 0.0868537500500679, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0021, + "step": 28990 + }, + { + "epoch": 1.7376715201629818, + "grad_norm": 0.120276540517807, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0031, + "step": 29000 + }, + { + "epoch": 1.7382707172388998, + "grad_norm": 0.13164187967777252, + "learning_rate": 2.051813916260501e-06, + "loss": 0.0033, + "step": 29010 + }, + { + "epoch": 1.7388699143148183, + "grad_norm": 0.11800370365381241, + "learning_rate": 2.050774482045273e-06, + "loss": 0.003, + "step": 29020 + }, + { + "epoch": 1.7394691113907363, + "grad_norm": 0.09126367419958115, + "learning_rate": 2.049745550244661e-06, + "loss": 0.003, + "step": 29030 + }, + { + "epoch": 1.7400683084666548, + "grad_norm": 0.4037914276123047, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0031, + "step": 29040 + }, + { + "epoch": 1.7406675055425729, + "grad_norm": 0.11752860993146896, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0033, + "step": 29050 + }, + { + "epoch": 1.7412667026184914, + "grad_norm": 0.2887340188026428, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.0031, + "step": 29060 + }, + { + "epoch": 1.7418658996944094, + "grad_norm": 0.16717016696929932, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.0029, + "step": 29070 + }, + { + "epoch": 1.742465096770328, + "grad_norm": 0.10888686776161194, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.0029, + "step": 29080 + }, + { + "epoch": 1.743064293846246, + "grad_norm": 0.04226887598633766, + "learning_rate": 2.043792576905478e-06, + "loss": 0.003, + "step": 29090 + }, + { + "epoch": 1.7436634909221644, + "grad_norm": 0.20809385180473328, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.0035, + "step": 29100 + }, + { + "epoch": 1.7442626879980825, + "grad_norm": 0.09200141578912735, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0027, + "step": 29110 + }, + { + "epoch": 1.744861885074001, + "grad_norm": 0.11600892245769501, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0024, + "step": 29120 + }, + { + "epoch": 1.745461082149919, + "grad_norm": 0.12580904364585876, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0032, + "step": 29130 + }, + { + "epoch": 1.7460602792258375, + "grad_norm": 0.14242342114448547, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0031, + "step": 29140 + }, + { + "epoch": 1.7466594763017556, + "grad_norm": 0.06451822817325592, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0029, + "step": 29150 + }, + { + "epoch": 1.747258673377674, + "grad_norm": 0.1550203114748001, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.0039, + "step": 29160 + }, + { + "epoch": 1.747857870453592, + "grad_norm": 0.1968315690755844, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.004, + "step": 29170 + }, + { + "epoch": 1.7484570675295106, + "grad_norm": 0.16437779366970062, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0027, + "step": 29180 + }, + { + "epoch": 1.7490562646054286, + "grad_norm": 0.1665470451116562, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0032, + "step": 29190 + }, + { + "epoch": 1.7496554616813471, + "grad_norm": 0.070770762860775, + "learning_rate": 2.033861701352752e-06, + "loss": 0.0037, + "step": 29200 + }, + { + "epoch": 1.7502546587572652, + "grad_norm": 0.23530884087085724, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.0031, + "step": 29210 + }, + { + "epoch": 1.7508538558331836, + "grad_norm": 0.10555171966552734, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0052, + "step": 29220 + }, + { + "epoch": 1.7514530529091017, + "grad_norm": 0.0906955823302269, + "learning_rate": 2.031374190660474e-06, + "loss": 0.0026, + "step": 29230 + }, + { + "epoch": 1.7520522499850202, + "grad_norm": 0.137167826294899, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0045, + "step": 29240 + }, + { + "epoch": 1.7526514470609382, + "grad_norm": 0.10824514180421829, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.003, + "step": 29250 + }, + { + "epoch": 1.7532506441368567, + "grad_norm": 0.1013123095035553, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0033, + "step": 29260 + }, + { + "epoch": 1.7538498412127748, + "grad_norm": 0.05192271247506142, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0025, + "step": 29270 + }, + { + "epoch": 1.7544490382886933, + "grad_norm": 0.2546662986278534, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0041, + "step": 29280 + }, + { + "epoch": 1.7550482353646113, + "grad_norm": 0.16946350038051605, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.0033, + "step": 29290 + }, + { + "epoch": 1.7556474324405298, + "grad_norm": 0.19652776420116425, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0041, + "step": 29300 + }, + { + "epoch": 1.7562466295164478, + "grad_norm": 0.1391136795282364, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0052, + "step": 29310 + }, + { + "epoch": 1.7568458265923663, + "grad_norm": 0.1327095329761505, + "learning_rate": 2.02448011805316e-06, + "loss": 0.0033, + "step": 29320 + }, + { + "epoch": 1.7574450236682844, + "grad_norm": 0.12003149092197418, + "learning_rate": 2.023766764732934e-06, + "loss": 0.003, + "step": 29330 + }, + { + "epoch": 1.7580442207442029, + "grad_norm": 0.10392506420612335, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0028, + "step": 29340 + }, + { + "epoch": 1.758643417820121, + "grad_norm": 0.19025546312332153, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0043, + "step": 29350 + }, + { + "epoch": 1.7592426148960394, + "grad_norm": 0.2707730829715729, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0033, + "step": 29360 + }, + { + "epoch": 1.7598418119719574, + "grad_norm": 0.20897458493709564, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0037, + "step": 29370 + }, + { + "epoch": 1.760441009047876, + "grad_norm": 0.1678476631641388, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0031, + "step": 29380 + }, + { + "epoch": 1.761040206123794, + "grad_norm": 0.06412776559591293, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0027, + "step": 29390 + }, + { + "epoch": 1.7616394031997125, + "grad_norm": 0.07877464592456818, + "learning_rate": 2.01906829188547e-06, + "loss": 0.0027, + "step": 29400 + }, + { + "epoch": 1.7622386002756305, + "grad_norm": 0.28429147601127625, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0036, + "step": 29410 + }, + { + "epoch": 1.762837797351549, + "grad_norm": 0.11621754616498947, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0028, + "step": 29420 + }, + { + "epoch": 1.763436994427467, + "grad_norm": 0.14076471328735352, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0037, + "step": 29430 + }, + { + "epoch": 1.7640361915033855, + "grad_norm": 0.15799261629581451, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.0035, + "step": 29440 + }, + { + "epoch": 1.7646353885793036, + "grad_norm": 0.08505155891180038, + "learning_rate": 2.016028412496094e-06, + "loss": 0.0024, + "step": 29450 + }, + { + "epoch": 1.765234585655222, + "grad_norm": 0.10400725156068802, + "learning_rate": 2.015452063399292e-06, + "loss": 0.0048, + "step": 29460 + }, + { + "epoch": 1.7658337827311401, + "grad_norm": 0.19443514943122864, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0033, + "step": 29470 + }, + { + "epoch": 1.7664329798070586, + "grad_norm": 0.06996763497591019, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0031, + "step": 29480 + }, + { + "epoch": 1.7670321768829766, + "grad_norm": 0.12015870213508606, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0028, + "step": 29490 + }, + { + "epoch": 1.7676313739588951, + "grad_norm": 0.14683139324188232, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.004, + "step": 29500 + }, + { + "epoch": 1.7682305710348134, + "grad_norm": 0.1023707166314125, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0031, + "step": 29510 + }, + { + "epoch": 1.7688297681107317, + "grad_norm": 0.16903221607208252, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0027, + "step": 29520 + }, + { + "epoch": 1.76942896518665, + "grad_norm": 0.10042630881071091, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0035, + "step": 29530 + }, + { + "epoch": 1.7700281622625682, + "grad_norm": 0.0850566178560257, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0059, + "step": 29540 + }, + { + "epoch": 1.7706273593384865, + "grad_norm": 0.09834299236536026, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0028, + "step": 29550 + }, + { + "epoch": 1.7712265564144047, + "grad_norm": 0.13409706950187683, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0031, + "step": 29560 + }, + { + "epoch": 1.771825753490323, + "grad_norm": 0.08727999776601791, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0032, + "step": 29570 + }, + { + "epoch": 1.7724249505662413, + "grad_norm": 0.05625200644135475, + "learning_rate": 2.009358476954456e-06, + "loss": 0.003, + "step": 29580 + }, + { + "epoch": 1.7730241476421595, + "grad_norm": 0.13802480697631836, + "learning_rate": 2.008919246387043e-06, + "loss": 0.003, + "step": 29590 + }, + { + "epoch": 1.7736233447180778, + "grad_norm": 0.12048090994358063, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0023, + "step": 29600 + }, + { + "epoch": 1.774222541793996, + "grad_norm": 0.06570231169462204, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0025, + "step": 29610 + }, + { + "epoch": 1.7748217388699143, + "grad_norm": 0.1293211132287979, + "learning_rate": 2.007664865889131e-06, + "loss": 0.0029, + "step": 29620 + }, + { + "epoch": 1.7754209359458326, + "grad_norm": 0.19836539030075073, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.0026, + "step": 29630 + }, + { + "epoch": 1.7760201330217509, + "grad_norm": 0.23906737565994263, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0028, + "step": 29640 + }, + { + "epoch": 1.7766193300976691, + "grad_norm": 0.1388060599565506, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0034, + "step": 29650 + }, + { + "epoch": 1.7772185271735874, + "grad_norm": 0.09379242360591888, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0031, + "step": 29660 + }, + { + "epoch": 1.7778177242495057, + "grad_norm": 0.18343773484230042, + "learning_rate": 2.005785297772354e-06, + "loss": 0.003, + "step": 29670 + }, + { + "epoch": 1.778416921325424, + "grad_norm": 0.16866934299468994, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0026, + "step": 29680 + }, + { + "epoch": 1.7790161184013422, + "grad_norm": 0.08610724657773972, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0027, + "step": 29690 + }, + { + "epoch": 1.7796153154772605, + "grad_norm": 0.060445595532655716, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.002, + "step": 29700 + }, + { + "epoch": 1.7802145125531788, + "grad_norm": 0.19706133008003235, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0029, + "step": 29710 + }, + { + "epoch": 1.780813709629097, + "grad_norm": 0.10716386139392853, + "learning_rate": 2.004169609334462e-06, + "loss": 0.004, + "step": 29720 + }, + { + "epoch": 1.7814129067050153, + "grad_norm": 0.18327921628952026, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0044, + "step": 29730 + }, + { + "epoch": 1.7820121037809336, + "grad_norm": 0.1188778281211853, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0042, + "step": 29740 + }, + { + "epoch": 1.7826113008568518, + "grad_norm": 0.2874482274055481, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.003, + "step": 29750 + }, + { + "epoch": 1.78321049793277, + "grad_norm": 0.07464325428009033, + "learning_rate": 2.003067084067522e-06, + "loss": 0.0029, + "step": 29760 + }, + { + "epoch": 1.7838096950086884, + "grad_norm": 0.17671462893486023, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.0029, + "step": 29770 + }, + { + "epoch": 1.7844088920846066, + "grad_norm": 0.09008106589317322, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0031, + "step": 29780 + }, + { + "epoch": 1.785008089160525, + "grad_norm": 0.06681933254003525, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.0029, + "step": 29790 + }, + { + "epoch": 1.7856072862364432, + "grad_norm": 0.09899364411830902, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.0032, + "step": 29800 + }, + { + "epoch": 1.7862064833123614, + "grad_norm": 0.0671558529138565, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.0036, + "step": 29810 + }, + { + "epoch": 1.7868056803882797, + "grad_norm": 0.11834210157394409, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0033, + "step": 29820 + }, + { + "epoch": 1.787404877464198, + "grad_norm": 0.37054625153541565, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0038, + "step": 29830 + }, + { + "epoch": 1.7880040745401162, + "grad_norm": 0.12430086731910706, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0043, + "step": 29840 + }, + { + "epoch": 1.7886032716160345, + "grad_norm": 0.15685392916202545, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0031, + "step": 29850 + }, + { + "epoch": 1.7892024686919528, + "grad_norm": 0.15734116733074188, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0034, + "step": 29860 + }, + { + "epoch": 1.789801665767871, + "grad_norm": 0.1462196558713913, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0027, + "step": 29870 + }, + { + "epoch": 1.7904008628437893, + "grad_norm": 0.10963186621665955, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0031, + "step": 29880 + }, + { + "epoch": 1.7910000599197076, + "grad_norm": 0.08986041694879532, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0024, + "step": 29890 + }, + { + "epoch": 1.7915992569956258, + "grad_norm": 0.1555427759885788, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.0048, + "step": 29900 + }, + { + "epoch": 1.792198454071544, + "grad_norm": 0.10785987228155136, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0027, + "step": 29910 + }, + { + "epoch": 1.7927976511474624, + "grad_norm": 0.16140185296535492, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0032, + "step": 29920 + }, + { + "epoch": 1.7933968482233806, + "grad_norm": 0.21847034990787506, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0038, + "step": 29930 + }, + { + "epoch": 1.793996045299299, + "grad_norm": 0.11339953541755676, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.003, + "step": 29940 + }, + { + "epoch": 1.7945952423752172, + "grad_norm": 0.142179474234581, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0036, + "step": 29950 + }, + { + "epoch": 1.7951944394511354, + "grad_norm": 0.09894557297229767, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0026, + "step": 29960 + }, + { + "epoch": 1.7957936365270537, + "grad_norm": 0.2643095850944519, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0027, + "step": 29970 + }, + { + "epoch": 1.796392833602972, + "grad_norm": 0.11731639504432678, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0028, + "step": 29980 + }, + { + "epoch": 1.7969920306788902, + "grad_norm": 0.07690370082855225, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0028, + "step": 29990 + }, + { + "epoch": 1.7975912277548085, + "grad_norm": 0.10461316257715225, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.0037, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.873893288742748e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..19d22af7b7d6155175015b5c3c5b452030d153ea --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/checkpoint-30000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf8e16588ffacf58cd09ed0241d355125d76c992d11c15a4bc8ee94db38dc3b +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1176494009828ca1a8d623c603070781658572df --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": true, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3180771919159b1a2d73c43bc8cf42097657196 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8805f8908531b109f7f6b4df063664b6f536cfa398316ef359bee72c5e28e2 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..627385151b88c21eac3195320bb4364d04f3e13c --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86948cca4d01a987e5a3bd9f9005ea3f68bc6d7e9966463f5f48a450f5cf1971 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c97ef2523875853c9b2568bed1c7ba1a232166d7 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5d6df2df75c34f17edb92890cc4bf03e44c09933f4a35f72f6f5be21049eea +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..0da6184f9732635317d9591566929a0f088174db --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -24.608807465362545, + -30.57493604888916, + -14.421680474472046, + -1.8400005650520326, + -2.2583390679359434, + -1.9374337060928344, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 2.947746359062201, + 22.348905650329584, + 21.642364361572263, + 2.36660552740097, + 4.0908002225875855, + 3.2823701507568366, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + -6.435277462005615, + -1.046771764755249, + 3.5443263053894043, + 0.010237408801913261, + 0.7088965773582458, + 0.433538019657135, + 0.11327514797449112, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 7.037599563598633, + 16.91518783569336, + 8.290277481079102, + 0.6919190883636475, + 1.1289485692977905, + 0.9604002833366394, + 0.9935636520385742, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.702568125152588, + -21.763728466033935, + -21.216347326660156, + -2.3684931322097778, + -4.066458044528961, + -3.2888745792388914, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.601868363571164, + 30.525507734680176, + 14.354210775756833, + 1.8357849156379702, + 2.250663768482209, + 1.934181491851806, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.545124530792236, + 1.3164341449737549, + -3.4697155952453613, + -0.00962071679532528, + -0.7082296013832092, + -0.43808361887931824, + 0.13391299545764923, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.980162143707275, + 16.702543258666992, + 8.168180465698242, + 0.6913491487503052, + 1.1232151985168457, + 0.9606267809867859, + 0.990993082523346, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff8c0ecf109af377331583e4079865e7d8037bc8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 5 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f22fdb6dcccad5b1587b461554d7c55ca880346f --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/trainer_state.json @@ -0,0 +1,21043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7975912277548085, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 2.688621997833252, + "learning_rate": 1.8e-07, + "loss": 0.1495, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.1722424030303955, + "learning_rate": 3.8e-07, + "loss": 0.1358, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 2.3095974922180176, + "learning_rate": 5.800000000000001e-07, + "loss": 0.1268, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 2.131070852279663, + "learning_rate": 7.8e-07, + "loss": 0.1224, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 2.273555278778076, + "learning_rate": 9.800000000000001e-07, + "loss": 0.118, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 1.3571869134902954, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.111, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 1.6004165410995483, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0826, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 1.0413638353347778, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.0657, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 1.1965473890304565, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0493, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 1.1422100067138672, + "learning_rate": 1.98e-06, + "loss": 0.0444, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 0.6911118626594543, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0457, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 0.6770259737968445, + "learning_rate": 2.38e-06, + "loss": 0.0257, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 0.4811704456806183, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0208, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 0.7260023951530457, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.0203, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 0.4369716942310333, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.0174, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 0.4100959300994873, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.0133, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 1.0024627447128296, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.0149, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.4598183035850525, + "learning_rate": 3.58e-06, + "loss": 0.0143, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 0.7042055130004883, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.0143, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 0.7677909731864929, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0151, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 0.45090702176094055, + "learning_rate": 4.18e-06, + "loss": 0.0113, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 0.4400976598262787, + "learning_rate": 4.38e-06, + "loss": 0.0155, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 0.2424178272485733, + "learning_rate": 4.58e-06, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 0.4720967411994934, + "learning_rate": 4.78e-06, + "loss": 0.0166, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 0.41622042655944824, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0104, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 0.6915765404701233, + "learning_rate": 5.18e-06, + "loss": 0.0108, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.25931113958358765, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0104, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.42486071586608887, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.3798843324184418, + "learning_rate": 5.78e-06, + "loss": 0.0107, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.3281213343143463, + "learning_rate": 5.98e-06, + "loss": 0.0081, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 0.3394489884376526, + "learning_rate": 6.18e-06, + "loss": 0.01, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 0.38298189640045166, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0098, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 0.3188078999519348, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0104, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.3152049779891968, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0087, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.34163472056388855, + "learning_rate": 6.98e-06, + "loss": 0.01, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 0.43860143423080444, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0065, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.2845093309879303, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0086, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 0.4009752869606018, + "learning_rate": 7.58e-06, + "loss": 0.0099, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.37756970524787903, + "learning_rate": 7.78e-06, + "loss": 0.0097, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.38135284185409546, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0076, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 0.3145769536495209, + "learning_rate": 8.18e-06, + "loss": 0.0106, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 0.32534345984458923, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0069, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.24024507403373718, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0089, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 0.32857799530029297, + "learning_rate": 8.78e-06, + "loss": 0.0105, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.28823110461235046, + "learning_rate": 8.98e-06, + "loss": 0.0101, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 0.32506972551345825, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0126, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 0.19875723123550415, + "learning_rate": 9.38e-06, + "loss": 0.0081, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.3245992958545685, + "learning_rate": 9.58e-06, + "loss": 0.0099, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.24933603405952454, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0117, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.3154098391532898, + "learning_rate": 9.980000000000001e-06, + "loss": 0.009, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.3685779273509979, + "learning_rate": 1.018e-05, + "loss": 0.0101, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 0.7251449823379517, + "learning_rate": 1.038e-05, + "loss": 0.0119, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 0.3183727264404297, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.009, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.3737810254096985, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0089, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.45293235778808594, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.011, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 0.3476772606372833, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.008, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.38373252749443054, + "learning_rate": 1.138e-05, + "loss": 0.0088, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 0.2530902624130249, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.008, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 0.19455896317958832, + "learning_rate": 1.178e-05, + "loss": 0.008, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.3315221071243286, + "learning_rate": 1.198e-05, + "loss": 0.0102, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.23430880904197693, + "learning_rate": 1.218e-05, + "loss": 0.007, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.4636307656764984, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0075, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.3785994052886963, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0109, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.2804955542087555, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0099, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.393702894449234, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0132, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.400641530752182, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0099, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 0.24428881704807281, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0076, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 0.4449252188205719, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0103, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.406582236289978, + "learning_rate": 1.378e-05, + "loss": 0.0098, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.36386170983314514, + "learning_rate": 1.398e-05, + "loss": 0.0088, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.38196030259132385, + "learning_rate": 1.418e-05, + "loss": 0.01, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.28740620613098145, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.3616485297679901, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0094, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.4004146158695221, + "learning_rate": 1.478e-05, + "loss": 0.009, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.4585514962673187, + "learning_rate": 1.498e-05, + "loss": 0.0092, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.20028235018253326, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0138, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 0.46603646874427795, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0139, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0116, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.22323082387447357, + "learning_rate": 1.578e-05, + "loss": 0.0097, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.26777058839797974, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0081, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.32380548119544983, + "learning_rate": 1.618e-05, + "loss": 0.0087, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.5248059630393982, + "learning_rate": 1.638e-05, + "loss": 0.0102, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.3495309054851532, + "learning_rate": 1.658e-05, + "loss": 0.0121, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.3551771342754364, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.5039486289024353, + "learning_rate": 1.698e-05, + "loss": 0.0094, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.3826751410961151, + "learning_rate": 1.718e-05, + "loss": 0.0107, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.46699973940849304, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0122, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3312668204307556, + "learning_rate": 1.758e-05, + "loss": 0.0087, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 0.28113219141960144, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.49752357602119446, + "learning_rate": 1.798e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.4177795350551605, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0096, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.34015583992004395, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0082, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.4612225890159607, + "learning_rate": 1.858e-05, + "loss": 0.0084, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.3813643753528595, + "learning_rate": 1.878e-05, + "loss": 0.012, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 0.27937838435173035, + "learning_rate": 1.898e-05, + "loss": 0.0104, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.4471273422241211, + "learning_rate": 1.918e-05, + "loss": 0.0125, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.4010440707206726, + "learning_rate": 1.938e-05, + "loss": 0.0106, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.41607654094696045, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0107, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 0.3589233458042145, + "learning_rate": 1.978e-05, + "loss": 0.0081, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.5726460814476013, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0111, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.36717164516448975, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0102, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.47284170985221863, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.01, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.5372244119644165, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.40928924083709717, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0088, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.4905182421207428, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0107, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.3709850609302521, + "learning_rate": 1.999981616897523e-05, + "loss": 0.01, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 0.6419615745544434, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0095, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.4986196458339691, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0127, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.5523516535758972, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0115, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.5443158745765686, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0113, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 0.5146775245666504, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0101, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.2972394824028015, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0092, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.4030104875564575, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0097, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 0.4765481650829315, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0136, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.4051239788532257, + "learning_rate": 1.999882759038658e-05, + "loss": 0.0113, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.3703782558441162, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5248176455497742, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0112, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.3100311756134033, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0083, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.45929211378097534, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0114, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 0.5695507526397705, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0095, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.5395359992980957, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0151, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.5106327533721924, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0124, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.3423260450363159, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0132, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.32126766443252563, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.011, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.5105165839195251, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0085, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 0.31927764415740967, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0088, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 0.4421865940093994, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0093, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.2930506765842438, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0091, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.2920694053173065, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0085, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.2661049962043762, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0081, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 0.3047257661819458, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0083, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.2774506211280823, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0091, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.2554785907268524, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0096, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.5792570114135742, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0108, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.3250623941421509, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0125, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 0.5885359048843384, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0117, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.37988749146461487, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.009, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.3751101493835449, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0099, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.31976667046546936, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0097, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 0.37007251381874084, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0079, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.4624205231666565, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0103, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 0.3769538700580597, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0094, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.25460657477378845, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0076, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.3976004719734192, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0109, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.2983521521091461, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0113, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.25581008195877075, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0101, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.29260268807411194, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0102, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.3522181808948517, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0105, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.36269208788871765, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0103, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.40412119030952454, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0116, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.24089744687080383, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0119, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.4667617082595825, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0084, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.30139675736427307, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0101, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.38486286997795105, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0097, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.3526909649372101, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0071, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.3023934066295624, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0125, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.2796316146850586, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.25742489099502563, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0089, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.3626627027988434, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.01, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.3032572567462921, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0084, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.23514018952846527, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0086, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.3835832476615906, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0091, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.5170259475708008, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0146, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 0.8983817100524902, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0112, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.26260825991630554, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0086, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.481942743062973, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0126, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.311187207698822, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0064, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.3346790373325348, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.33836621046066284, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0085, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.3678463101387024, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0098, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.6136184334754944, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0154, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.39811593294143677, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0112, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6973778009414673, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0099, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.4773237109184265, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.3776084780693054, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.009, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 0.5061993598937988, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0097, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.41183987259864807, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.009, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 0.31513598561286926, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0112, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.4571514129638672, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0097, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.3183996379375458, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.2978666126728058, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0089, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.4791043698787689, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0087, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.5216032266616821, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0124, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.44693392515182495, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0092, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 0.41371819376945496, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.3593288064002991, + "learning_rate": 1.996106060741973e-05, + "loss": 0.014, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 0.4550306499004364, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0098, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.3510669469833374, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0066, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.2778814136981964, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.32210350036621094, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0067, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.42160800099372864, + "learning_rate": 1.995639934033493e-05, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.49051347374916077, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0102, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.3643694519996643, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.009, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.3717772960662842, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0076, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.32102280855178833, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0081, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.36725476384162903, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0102, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.39626258611679077, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0078, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.4183773696422577, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0105, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.3494930863380432, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0078, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.6155357956886292, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0119, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.34380587935447693, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.0105, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.5476253032684326, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.01, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 0.37999996542930603, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0094, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 0.3124147057533264, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0125, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.4887244999408722, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.01, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5969874858856201, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0106, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 0.4295594096183777, + "learning_rate": 1.993971819309759e-05, + "loss": 0.007, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.3899303078651428, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0096, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.3912282884120941, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0075, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.5355616807937622, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0093, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.29141828417778015, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0129, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.24389855563640594, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.009, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.4070908725261688, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0085, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.26783379912376404, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0071, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.2644960880279541, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0089, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.35223162174224854, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0093, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.47337162494659424, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0095, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.25418519973754883, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0093, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 0.36384159326553345, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0082, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.30014440417289734, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0081, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.41121408343315125, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0081, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.5576186776161194, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.008, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.35785913467407227, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0083, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.3306240439414978, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0084, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 0.37215736508369446, + "learning_rate": 1.991774193879505e-05, + "loss": 0.012, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.5504099726676941, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0088, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.24932143092155457, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.007, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.5866615176200867, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.5174368619918823, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0121, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.2345893532037735, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0095, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.2683233916759491, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0068, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.2471713274717331, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.5090919733047485, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0108, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.2857886552810669, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0078, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.23729385435581207, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0096, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.30867621302604675, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0088, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.42522960901260376, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0103, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.37170591950416565, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0105, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.3672806918621063, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0121, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.4048611521720886, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.01, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.24768167734146118, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0125, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 0.5003495812416077, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0125, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.4303686022758484, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0084, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.3701602518558502, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0101, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.38272005319595337, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0075, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.2844183146953583, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0105, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.31114980578422546, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0095, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.3436568081378937, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.273001104593277, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0076, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.2653564512729645, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.3115384578704834, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0132, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.25932809710502625, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0083, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.28656521439552307, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0066, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.31808462738990784, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.0115, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.18877890706062317, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0092, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.3685394525527954, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0091, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.3878263533115387, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0082, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 0.284507691860199, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0085, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.3473755121231079, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0081, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.39935287833213806, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0081, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.34282153844833374, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0076, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.3581090271472931, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0087, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.37332627177238464, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0089, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 0.5224587321281433, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0089, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.42577075958251953, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0108, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.4602234959602356, + "learning_rate": 1.985504281027289e-05, + "loss": 0.014, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.4852961003780365, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0091, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.4437471628189087, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0112, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.37050408124923706, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0068, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.3345497250556946, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.36727628111839294, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0081, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 0.37056809663772583, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0152, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.5640603303909302, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0085, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 0.3653910160064697, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0078, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.2954258322715759, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0083, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6086210012435913, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0082, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 0.5260390043258667, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0105, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.3067379295825958, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.0092, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.3480100929737091, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0088, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.26472753286361694, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0067, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.5254784226417542, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0146, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.35744136571884155, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0098, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.36186468601226807, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0084, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 0.35203835368156433, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0115, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.30590811371803284, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0108, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0082, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.2946765720844269, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0075, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.33707642555236816, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.007, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.2572688162326813, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0099, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.3901146352291107, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0185, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.4349755644798279, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0084, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.2383752018213272, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0092, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.46043846011161804, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.24630354344844818, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0062, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.5232640504837036, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0067, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 0.3850713074207306, + "learning_rate": 1.979809151602651e-05, + "loss": 0.014, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 0.44703760743141174, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0081, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.3762659728527069, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0099, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.4593638479709625, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0093, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.40554332733154297, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.33439910411834717, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0081, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.2623269855976105, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0062, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.22419600188732147, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0078, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.37183159589767456, + "learning_rate": 1.978133252131276e-05, + "loss": 0.01, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.27857136726379395, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0089, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.27683520317077637, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0069, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.45064759254455566, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0076, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.24215294420719147, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.0071, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.5163891315460205, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.0078, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.3922234773635864, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0077, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.19653558731079102, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0063, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.17621839046478271, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0084, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.6482162475585938, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.32759004831314087, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0088, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.33347561955451965, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0073, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.42883744835853577, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0084, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 0.3348788917064667, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0082, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.28349289298057556, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0102, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.2733197510242462, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0074, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.3263874351978302, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.01, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.295757532119751, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0071, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5598515868186951, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0093, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.425937294960022, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0083, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.2442379742860794, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0087, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.3378766179084778, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0163, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5137761831283569, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0099, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.3825916647911072, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0096, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.32084307074546814, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0066, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.3979593515396118, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0077, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.3103732764720917, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0067, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.5531997084617615, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0131, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5423216819763184, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0121, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.5038735270500183, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0087, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.44273868203163147, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.008, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.335232675075531, + "learning_rate": 1.971017390295979e-05, + "loss": 0.009, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.4746256470680237, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.26807400584220886, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0075, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.35464033484458923, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0123, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.33803898096084595, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0094, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 0.20334473252296448, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.34386369585990906, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0081, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.38781842589378357, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0088, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.25994163751602173, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0079, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.3342406451702118, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0091, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.3120318353176117, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0079, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.3556351661682129, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0073, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.21421445906162262, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0095, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.39498451352119446, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0087, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0079, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.16734588146209717, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0072, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.3987548351287842, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0083, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.3929785490036011, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0096, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.2884303331375122, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0102, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.3338335454463959, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0092, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.47452738881111145, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0093, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.25584715604782104, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0068, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 0.3038389980792999, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0076, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.4123639464378357, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0101, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.38520675897598267, + "learning_rate": 1.964833301001045e-05, + "loss": 0.014, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.3355116844177246, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0094, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.3479195535182953, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0105, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.2700177729129791, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0076, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.963745667883003e-05, + "loss": 0.008, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 0.18578873574733734, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0071, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.26316413283348083, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0079, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.28762468695640564, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0115, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 0.3712877631187439, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0074, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.2862299382686615, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0072, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.2730867564678192, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0101, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.327648401260376, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.41153189539909363, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.32522135972976685, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0095, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.22764958441257477, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0085, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.3491888642311096, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.3123551607131958, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0103, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.1881783902645111, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0085, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.40902259945869446, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0089, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.382953941822052, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0088, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 0.23950865864753723, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0064, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.3419397175312042, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0118, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.42207059264183044, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0091, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.40754130482673645, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0087, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.2390766590833664, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0069, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.2974188029766083, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.0091, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.2993582785129547, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0092, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.42652204632759094, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0068, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.3138194680213928, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.009, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.38833311200141907, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0083, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.4015152156352997, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0081, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.42086881399154663, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.007, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.26732996106147766, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0071, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.5763937830924988, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0101, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.2955382764339447, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0075, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.4625638723373413, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0094, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.29631468653678894, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0096, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.46335819363594055, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0103, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.3183141350746155, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.008, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.26456212997436523, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0083, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 0.40924879908561707, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0097, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 0.3981763422489166, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0094, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.36437541246414185, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0064, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.2935962378978729, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0081, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.3478807210922241, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0079, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.3460087180137634, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0069, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.2706817090511322, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0088, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.2674945890903473, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0083, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.2268197238445282, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0072, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.3216208219528198, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0092, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.3226968050003052, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0101, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.2743329405784607, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0075, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.32573118805885315, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0094, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 0.53167325258255, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0099, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.3915646970272064, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0089, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.4526256322860718, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0101, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.323249489068985, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0094, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4046335816383362, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0088, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.34745559096336365, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0078, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.30308133363723755, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0071, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.37923407554626465, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0076, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 0.26785972714424133, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0093, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.2778306305408478, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0083, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.611038088798523, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0098, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.4114893078804016, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0111, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.2732110023498535, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0076, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.2964401841163635, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0095, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.40240928530693054, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0097, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.3901022672653198, + "learning_rate": 1.944152646499645e-05, + "loss": 0.008, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.38001132011413574, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0109, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.35937973856925964, + "learning_rate": 1.943474465322135e-05, + "loss": 0.007, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.2745327651500702, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0075, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.1598518043756485, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.007, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.401614785194397, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0115, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.4127846360206604, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0068, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.22147920727729797, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0061, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.28602245450019836, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0067, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.22147324681282043, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0076, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.2550548315048218, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0088, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.24113087356090546, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0076, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.3658410608768463, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0075, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.3856262266635895, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0112, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0075, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.3767516314983368, + "learning_rate": 1.938969919958475e-05, + "loss": 0.01, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.24380649626255035, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.009, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.30575039982795715, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0079, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.32913386821746826, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.009, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.29845312237739563, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0099, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.44377902150154114, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.34614384174346924, + "learning_rate": 1.936834723687526e-05, + "loss": 0.009, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.3316318690776825, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0096, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.4076138734817505, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 0.30320486426353455, + "learning_rate": 1.935753861926916e-05, + "loss": 0.015, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.32243025302886963, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.011, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.323745459318161, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0077, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5750753283500671, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0088, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.22709843516349792, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0101, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.3067542314529419, + "learning_rate": 1.933932815280178e-05, + "loss": 0.007, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.392337828874588, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0089, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.43343180418014526, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0073, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.4371345341205597, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0078, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.35214635729789734, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0077, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.3259161412715912, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.3849303722381592, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0066, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.3968902826309204, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.33016201853752136, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0095, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.3859156668186188, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.008, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.3020654618740082, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.007, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.44503262639045715, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0105, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.3908904194831848, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0073, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.39256253838539124, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0078, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.352611243724823, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0077, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.39203983545303345, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0081, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.23835115134716034, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0066, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.24996638298034668, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0098, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.29537609219551086, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0082, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.2898835837841034, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0077, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.4040369391441345, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.3501318395137787, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0094, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5462452173233032, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0097, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.4217568337917328, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0072, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.18295089900493622, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0083, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.3695569336414337, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0078, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.37818798422813416, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0089, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.29818472266197205, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0084, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.3328498303890228, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.01, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.340724378824234, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0075, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.2966301441192627, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0063, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.30677109956741333, + "learning_rate": 1.922098355206593e-05, + "loss": 0.008, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.2091839611530304, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.0078, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.4229014217853546, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0115, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.40779992938041687, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0075, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.378817081451416, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.008, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.29796919226646423, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0092, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.2702767252922058, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0076, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.31349876523017883, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0085, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 0.30500444769859314, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0093, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.2860834002494812, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0061, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.26036593317985535, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0099, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.19049863517284393, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0075, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.3235284388065338, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0083, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.364092618227005, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.011, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.2409065216779709, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0092, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.36907926201820374, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.008, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.3230077922344208, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.191047802567482, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0063, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.3346494436264038, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0082, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.21352025866508484, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0075, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.5505086779594421, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0095, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.34264758229255676, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0083, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.20266413688659668, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0074, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.24938757717609406, + "learning_rate": 1.912718096497034e-05, + "loss": 0.007, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.4140026569366455, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0086, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.4424414038658142, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0104, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 0.5327904224395752, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0115, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 0.35958340764045715, + "learning_rate": 1.911035077753307e-05, + "loss": 0.01, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.2547682523727417, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0066, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.3701247274875641, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0115, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.34443217515945435, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0077, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.20353800058364868, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0061, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5660653114318848, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0091, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.26445311307907104, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0073, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.5561402440071106, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0071, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.3700469434261322, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.35783904790878296, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.008, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.3238641619682312, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0081, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.25247740745544434, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0099, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.435730904340744, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.008, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.37758126854896545, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0068, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.33323949575424194, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.0094, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.4356318712234497, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0093, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 0.37893903255462646, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0058, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.4411139190196991, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0085, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0087, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4287096858024597, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0107, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.43085435032844543, + "learning_rate": 1.902392195640386e-05, + "loss": 0.009, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 0.2709400951862335, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0066, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.358126163482666, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0082, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.25320038199424744, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0077, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 0.31440937519073486, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0077, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.25246965885162354, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0079, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.28420332074165344, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0101, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.25251317024230957, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0075, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.19744229316711426, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0069, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4457854628562927, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0073, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.36817625164985657, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0096, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.3394709825515747, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0073, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.2909093201160431, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0065, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0057, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.29520732164382935, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0072, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.25512900948524475, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0096, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0073, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.33459368348121643, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0096, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.21619321405887604, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0063, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.25518253445625305, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0067, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.2273867279291153, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.007, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.2864684462547302, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0074, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.3077942728996277, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0075, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 0.40526703000068665, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.41480064392089844, + "learning_rate": 1.891523933768891e-05, + "loss": 0.01, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.2750788629055023, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0064, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 0.29671600461006165, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0095, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 0.24160107970237732, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0069, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 0.2949109971523285, + "learning_rate": 1.889660337749874e-05, + "loss": 0.007, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.2847975492477417, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0059, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.30052465200424194, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0067, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.36128419637680054, + "learning_rate": 1.888252908366661e-05, + "loss": 0.014, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.36974236369132996, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0064, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.43730056285858154, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0084, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.3145422339439392, + "learning_rate": 1.88683715346172e-05, + "loss": 0.008, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.35473865270614624, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0091, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.2501350939273834, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.008, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.34808069467544556, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0099, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.45218509435653687, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.0068, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.34530994296073914, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0098, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.38257333636283875, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0101, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.3040159344673157, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0079, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.3323517143726349, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0068, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.2639414370059967, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0078, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.3493870794773102, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0081, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.5838330984115601, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0091, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 0.428803026676178, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0087, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.3654572069644928, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0114, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.3295663297176361, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0075, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.3469060957431793, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0074, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.3366406261920929, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0066, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.32569241523742676, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0054, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3086700737476349, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0086, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.38562801480293274, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0092, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.3523421585559845, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0085, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.2278694063425064, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0063, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.32141822576522827, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0147, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.3375259041786194, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0077, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4483063220977783, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0062, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.3667140007019043, + "learning_rate": 1.874717450126662e-05, + "loss": 0.008, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.3419000506401062, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0079, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.36556369066238403, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0079, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.33135318756103516, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0064, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0091, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.34939518570899963, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0072, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.34424352645874023, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0077, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.3460613191127777, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.38822048902511597, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0066, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.35550639033317566, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0083, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0087, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.38202086091041565, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0081, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.25744789838790894, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0074, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.29700344800949097, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0082, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.305786669254303, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0076, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.3291271924972534, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.26111704111099243, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0074, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.348176509141922, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0086, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.27502793073654175, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0076, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.2831551432609558, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0092, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.39652079343795776, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0066, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.3885122239589691, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0087, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.37296077609062195, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.33606627583503723, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0086, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.3855937421321869, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0097, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.3322301506996155, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0076, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 0.33322253823280334, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.009, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.22358210384845734, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0088, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.5901851058006287, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0088, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.4703235328197479, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0084, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.20072896778583527, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0077, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.3537980616092682, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0098, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.3123277723789215, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0068, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.35979342460632324, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.38628828525543213, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0074, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.3498038053512573, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0074, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.20784054696559906, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0059, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.1811107099056244, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0085, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.43317103385925293, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0064, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.3815033435821533, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0064, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.35989734530448914, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.008, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.46118423342704773, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.012, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.25334376096725464, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0078, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.26764073967933655, + "learning_rate": 1.852547637090483e-05, + "loss": 0.01, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.2785920202732086, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0066, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.41587865352630615, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0061, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.42850133776664734, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.32369133830070496, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0091, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.2930110692977905, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0069, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.3199067711830139, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0085, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 0.4349478483200073, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0078, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 0.3054976165294647, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0061, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.2826739251613617, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0068, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.25106528401374817, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.007, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.25897887349128723, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0076, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.26398584246635437, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.41751599311828613, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0083, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.17239610850811005, + "learning_rate": 1.844974808419918e-05, + "loss": 0.006, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3300461173057556, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0051, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.2645586133003235, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0068, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.24550332129001617, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0071, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.2889944911003113, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0091, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.476601779460907, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0066, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.35630306601524353, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0074, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.35651877522468567, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0084, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.3889803886413574, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0079, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4214278757572174, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.009, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.30540233850479126, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0083, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.3624532222747803, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0076, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.32963570952415466, + "learning_rate": 1.838347361898993e-05, + "loss": 0.01, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.3533381521701813, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0064, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.3011729419231415, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0065, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.4733760952949524, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0089, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.38553985953330994, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0059, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.2560643255710602, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0073, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.39531010389328003, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0106, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.2701983153820038, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0086, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.352717787027359, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0096, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.29157745838165283, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0073, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.4267994165420532, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0075, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.36308032274246216, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0075, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.33457428216934204, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0103, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.3717971444129944, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0069, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 0.21432936191558838, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0081, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.2878777086734772, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0057, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.4453850984573364, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0095, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.36917057633399963, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.3252313733100891, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0082, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.2529674470424652, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0057, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.2816419303417206, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0097, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.6464210152626038, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0076, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.33034399151802063, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0069, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.27335023880004883, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0078, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 0.3158395290374756, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0064, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.5128306746482849, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0087, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 0.24884961545467377, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0084, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.324278324842453, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0075, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6472476124763489, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0093, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.21269051730632782, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0066, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.29203882813453674, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0074, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.30436405539512634, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0087, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5066608190536499, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0081, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.32647472620010376, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0066, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.2804315388202667, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0066, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.24779941141605377, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0074, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.34001022577285767, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0101, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.2611280381679535, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0082, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.3129233717918396, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0079, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.2822776734828949, + "learning_rate": 1.815952390818299e-05, + "loss": 0.0098, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.36969345808029175, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0064, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.33959338068962097, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0088, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.2628033459186554, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0062, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.38812723755836487, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0061, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.26403307914733887, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0055, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.28676870465278625, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0127, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.606293797492981, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.37321826815605164, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0063, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.368115097284317, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0091, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.3368416726589203, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0068, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.23466472327709198, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.006, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.3796599507331848, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0169, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.2202090471982956, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0099, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.5006175637245178, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0086, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.3673453629016876, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0083, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4379428029060364, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.006, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.43015891313552856, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0084, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.2806220054626465, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0061, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.23545289039611816, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0062, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 0.32115358114242554, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.3217777907848358, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0062, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.3224331736564636, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0072, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.31703537702560425, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0082, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 0.4175204932689667, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.008, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.22969186305999756, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0084, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.3421284258365631, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0077, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.32668444514274597, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0071, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.2729822099208832, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0068, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.33153197169303894, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0074, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.4678424000740051, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0076, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.23711496591567993, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0076, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.3230719566345215, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0084, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.32328692078590393, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0075, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.566879153251648, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0072, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.26277920603752136, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0062, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.339163601398468, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0082, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.23408609628677368, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0061, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.2942394018173218, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0065, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 0.3774799704551697, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0063, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.2847958207130432, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0072, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.2577030062675476, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0088, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.2883673906326294, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0075, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.3596307933330536, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0073, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.30285483598709106, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0076, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.2933914363384247, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0077, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.7666468024253845, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0102, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.31347739696502686, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0072, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.3435507118701935, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0081, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.3266170620918274, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0058, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.284027099609375, + "learning_rate": 1.784745142605655e-05, + "loss": 0.005, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.19972574710845947, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0072, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.2587524950504303, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0067, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.2922254204750061, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0064, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.17053507268428802, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0092, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.2850453555583954, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0073, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.2844892144203186, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0075, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.28969481587409973, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0079, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.4704195261001587, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0102, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.2652505338191986, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0077, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.2656702399253845, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0118, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.2282119244337082, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0086, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.30130353569984436, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0062, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.2295757234096527, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0066, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.25287938117980957, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0065, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.3274557292461395, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0076, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.34377023577690125, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0079, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.36259520053863525, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0055, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.24462608993053436, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0067, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.3615039587020874, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0088, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.40002626180648804, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0086, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.3362888991832733, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0062, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.33698126673698425, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0087, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.3287750482559204, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.0068, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.23409898579120636, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0063, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.23275460302829742, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0066, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.35324692726135254, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0068, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.2781875729560852, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0066, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.3083304166793823, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0069, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.22543831169605255, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0066, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.22566530108451843, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0066, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.3640650808811188, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0073, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.35346123576164246, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0069, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 0.30858153104782104, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0076, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.30895760655403137, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0074, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.30667638778686523, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0082, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.3134152889251709, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0086, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.21407048404216766, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0077, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.3456077575683594, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0083, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.4259016513824463, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.009, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.38690924644470215, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0094, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.31742537021636963, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0065, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.3568819463253021, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0077, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.3771888315677643, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0073, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.25528469681739807, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0067, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.36028411984443665, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0064, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.41987329721450806, + "learning_rate": 1.754802282200567e-05, + "loss": 0.007, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.18902993202209473, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0064, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.1859915405511856, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0086, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.1778331696987152, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0052, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4222147464752197, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.007, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.26806506514549255, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0074, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.34431734681129456, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0056, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.41732800006866455, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0079, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.3027847409248352, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0054, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.47592151165008545, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0066, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9539707899093628, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0095, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.4084669351577759, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0082, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.3052361309528351, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0072, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.23123528063297272, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.009, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.20356184244155884, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0073, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.048543930053711, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0148, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.3017459213733673, + "learning_rate": 1.74400239259128e-05, + "loss": 0.007, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.3679676353931427, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0085, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.20339734852313995, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0087, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.3523346781730652, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0076, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.4162348210811615, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0063, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.3293565511703491, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0067, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.24455691874027252, + "learning_rate": 1.739902378104222e-05, + "loss": 0.007, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 0.17645037174224854, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0051, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.2554231286048889, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0076, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.20006878674030304, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0076, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.27911216020584106, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0082, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.5701723694801331, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0081, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.222118079662323, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0072, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.2762138843536377, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0049, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 1.4110082387924194, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0114, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.31313180923461914, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0078, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.20941513776779175, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0079, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.3963930308818817, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0053, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.2066672146320343, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0049, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.3919369876384735, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0082, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.2544628083705902, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.0054, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.31123557686805725, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0078, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.24768301844596863, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0051, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.26674744486808777, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0052, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.27382466197013855, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0075, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.23384103178977966, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.0059, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.3531075417995453, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0068, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.34425088763237, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0066, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.2716144323348999, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0058, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.35163211822509766, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0071, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.23585639894008636, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0072, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.28066661953926086, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0068, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.3146689832210541, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0071, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.37553170323371887, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.008, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.18403242528438568, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0068, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.3904851973056793, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0072, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.4481397867202759, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0074, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.31124234199523926, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0074, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.3815377354621887, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0084, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.2909438908100128, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0074, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.3408021330833435, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0074, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.23902025818824768, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0076, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.2194853127002716, + "learning_rate": 1.714740708672306e-05, + "loss": 0.006, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 0.4337097108364105, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0092, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.4132380783557892, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0078, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.3434816598892212, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0076, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.25129666924476624, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0058, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.45458248257637024, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0064, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.5350340008735657, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.009, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 0.28008121252059937, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0073, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.33276447653770447, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0064, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37103456258773804, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0078, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 0.4689319133758545, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0073, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.3622629642486572, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.006, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0073, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.19226481020450592, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0059, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.27806007862091064, + "learning_rate": 1.704700993266678e-05, + "loss": 0.007, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.25948378443717957, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0076, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.5857216715812683, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0095, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.30467140674591064, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0073, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.2067701816558838, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0068, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 0.5653601288795471, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0087, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.3107249140739441, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0065, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4027363061904907, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0098, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.2757766544818878, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0091, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.30397671461105347, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0061, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.28112074732780457, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0063, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.14751966297626495, + "learning_rate": 1.696714953556411e-05, + "loss": 0.008, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.2988373935222626, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0055, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.2706286311149597, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0066, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.3612031042575836, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.006, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.25386789441108704, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0065, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.3170768916606903, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0056, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.4776926338672638, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0059, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.34828829765319824, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0088, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.20440815389156342, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0066, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.2943046987056732, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0068, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.16982606053352356, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0073, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.5607914924621582, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0085, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.35823172330856323, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0064, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.23943926393985748, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0068, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.24083787202835083, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0056, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.37987980246543884, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0062, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.35953620076179504, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0069, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.22255095839500427, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0071, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.4121200442314148, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0098, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.2377164363861084, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0076, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.2298472374677658, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0064, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.40824711322784424, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0066, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.33295100927352905, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.007, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.3978032171726227, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0077, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.27672451734542847, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.006, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.2591206729412079, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0089, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.1749347746372223, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0051, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.18699893355369568, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0056, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.240631103515625, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0089, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.3650512993335724, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0075, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.3503545820713043, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0067, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.3086877167224884, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.41695648431777954, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0064, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.33144691586494446, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0067, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.2679164409637451, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0072, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.22681233286857605, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0071, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.36362454295158386, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0067, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.20192845165729523, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0067, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.3895004093647003, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0055, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.22510671615600586, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0069, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.19641445577144623, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0101, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.2914806008338928, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0076, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.3187137544155121, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0059, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.3116552233695984, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0095, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.2597426772117615, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0058, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.21480600535869598, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.23912057280540466, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.006, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.317941278219223, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0064, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.58933025598526, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0095, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.21906700730323792, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0105, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.23899045586585999, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0059, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.2969389259815216, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0124, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.3514954447746277, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0066, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.18145518004894257, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0077, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.3087640404701233, + "learning_rate": 1.656303606359183e-05, + "loss": 0.006, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.3532063364982605, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0055, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.34000685811042786, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.24904295802116394, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0073, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.008, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.20241902768611908, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.009, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.3215351700782776, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0075, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0081, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.48170387744903564, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0071, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.3369109630584717, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0066, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.34541958570480347, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0058, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.2493886947631836, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0058, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.22845667600631714, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0096, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.2695702016353607, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0055, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 0.28211796283721924, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0052, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.1901162564754486, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0058, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.2701025605201721, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0061, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.36527693271636963, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0072, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.3061700463294983, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0067, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.5612105131149292, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0087, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.23399880528450012, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0072, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.314933180809021, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0078, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.35548436641693115, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0094, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.37685567140579224, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0084, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.3190719783306122, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0065, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.26337119936943054, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0063, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.3518264889717102, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0072, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.3185817003250122, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.2995646893978119, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0064, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.3110463619232178, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0063, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.24277286231517792, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0064, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.17603862285614014, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0061, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.28089356422424316, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0076, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.2855492830276489, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0047, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.3247278928756714, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0058, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.18349547684192657, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0061, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.30654969811439514, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.007, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.2674420177936554, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0067, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.38177546858787537, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0091, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.33796218037605286, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0068, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.3754856586456299, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0063, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.21820858120918274, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.007, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.36184942722320557, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0061, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.32240399718284607, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0063, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 0.24755406379699707, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0059, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.397858589887619, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0064, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.389072448015213, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0063, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.3368140757083893, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0071, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0062, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.24265453219413757, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0076, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.19892603158950806, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0064, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.1852462887763977, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0051, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.1886446475982666, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0075, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.25982722640037537, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0068, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.3376137614250183, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0058, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.33173730969429016, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0064, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.3177517354488373, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0072, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.3385971784591675, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0066, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0073, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.2335229516029358, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0056, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.24502214789390564, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0054, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.2009458988904953, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.3341793715953827, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0082, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.3872147798538208, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0063, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.29940876364707947, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0073, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.4895729720592499, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0086, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.4485950469970703, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.0053, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.22961653769016266, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0077, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.24187293648719788, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.005, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.3535212278366089, + "learning_rate": 1.601916647245149e-05, + "loss": 0.007, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.26539868116378784, + "learning_rate": 1.601107070706339e-05, + "loss": 0.008, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.43096065521240234, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0076, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.16919535398483276, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0058, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.2383720725774765, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0064, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.36103156208992004, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0067, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.2657287120819092, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0072, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.21437199413776398, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0065, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.34000417590141296, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0046, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.4855337142944336, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0068, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.3178497850894928, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0064, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.3171309530735016, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0067, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.3364340662956238, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0067, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2272711992263794, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0069, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.29505178332328796, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0078, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.3755042552947998, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0081, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.2983969449996948, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0085, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.3112468421459198, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0072, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.1950412392616272, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0061, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.2153436243534088, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0065, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.25062650442123413, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0079, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.1407836377620697, + "learning_rate": 1.584793312377278e-05, + "loss": 0.005, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.17276513576507568, + "learning_rate": 1.583971586792325e-05, + "loss": 0.006, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.47983887791633606, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0076, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.28724750876426697, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0076, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.3224884569644928, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0079, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.37969788908958435, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0063, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.48106926679611206, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0071, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.3555319905281067, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0075, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.19486083090305328, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.006, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.42018064856529236, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0074, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.3075830936431885, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0071, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.20921990275382996, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0063, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.20436584949493408, + "learning_rate": 1.574895332125391e-05, + "loss": 0.006, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.28120604157447815, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0071, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.22980183362960815, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0078, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.24825431406497955, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0064, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.22042447328567505, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0071, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.249199777841568, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0076, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.32628607749938965, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0057, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.35151633620262146, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0059, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.29098865389823914, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0064, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.24006013572216034, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0058, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.2797141671180725, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0073, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.2963006794452667, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0058, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.19539053738117218, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0053, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.2686854898929596, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0051, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.35952430963516235, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0071, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.21042552590370178, + "learning_rate": 1.562410199183484e-05, + "loss": 0.005, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.27942436933517456, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0068, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.17137926816940308, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0063, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.20331411063671112, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0047, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.15683002769947052, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0052, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.14726290106773376, + "learning_rate": 1.558221191857467e-05, + "loss": 0.006, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.2940376400947571, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0068, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.4059796929359436, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0067, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.2587816119194031, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0086, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.3462979793548584, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0078, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.5607128739356995, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0079, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.24189788103103638, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0052, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 0.23362945020198822, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0073, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.22395116090774536, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0059, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.3514958322048187, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0064, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.25395795702934265, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0081, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.2948741018772125, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.22298739850521088, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.46948447823524475, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0097, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.2992243468761444, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0083, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.18001538515090942, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0055, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.23337051272392273, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0066, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.2863878905773163, + "learning_rate": 1.543878746906905e-05, + "loss": 0.006, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.23027309775352478, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0072, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.21359150111675262, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0064, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.3878735601902008, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0069, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.29146283864974976, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.007, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.21782676875591278, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0051, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.45582008361816406, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0063, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.4554077982902527, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0067, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.2254059612751007, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0064, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.13952374458312988, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0061, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.23241721093654633, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0072, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.3424162268638611, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0058, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.21074503660202026, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0057, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.33662086725234985, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0056, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.24403709173202515, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0073, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.27195101976394653, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0058, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.34224429726600647, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0072, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.29089581966400146, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0053, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3397226333618164, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0066, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.30517837405204773, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0092, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.3485032021999359, + "learning_rate": 1.52681291800283e-05, + "loss": 0.007, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.31346458196640015, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0045, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.1864607185125351, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.006, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.20976679027080536, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.22616958618164062, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0059, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.14772117137908936, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.33677151799201965, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0059, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.32354292273521423, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0061, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.21409569680690765, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0064, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.4659721851348877, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 0.32267874479293823, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.5019848942756653, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0061, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.32694318890571594, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0076, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.3013843297958374, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0068, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.1973707377910614, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0059, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0056, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.3365449607372284, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0059, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.3398677110671997, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.007, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.27888917922973633, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0062, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.2814931273460388, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0069, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.3317541182041168, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.006, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.21940776705741882, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0052, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.239700049161911, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0059, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.19117280840873718, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0071, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.21827168762683868, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0056, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.25645333528518677, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0085, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.30847233533859253, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0055, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.3127819895744324, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0058, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.30181658267974854, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0075, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.34778207540512085, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0077, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.18988046050071716, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0048, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.3479195833206177, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0045, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.24158424139022827, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0051, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.14698052406311035, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0053, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.4441753625869751, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0065, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.28078633546829224, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.29406028985977173, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0048, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3856968581676483, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0067, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.36528849601745605, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0062, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.34250667691230774, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0053, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.2862832844257355, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0055, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.3683549761772156, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0091, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.26892581582069397, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0069, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.2220073938369751, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0052, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.18825116753578186, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0065, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.28731998801231384, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0069, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.26817163825035095, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0058, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.44162800908088684, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0065, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 0.2990165948867798, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0074, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.20428279042243958, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0053, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.2918189465999603, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0056, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.30408942699432373, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0063, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.2593521177768707, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0061, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.34048640727996826, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0054, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.2438877820968628, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0059, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.5205245018005371, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0065, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.3658570349216461, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0061, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.23279106616973877, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0039, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.2704083323478699, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0054, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.1849551945924759, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0061, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.21807430684566498, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0059, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.47879981994628906, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0061, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.24125567078590393, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0056, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.25820469856262207, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0053, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.30664944648742676, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0075, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.3646678030490875, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0057, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.2534210979938507, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0045, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.2125798910856247, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 0.4387839734554291, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0072, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.337387353181839, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.01, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.23150259256362915, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0072, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.3243090808391571, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0076, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.26716119050979614, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.006, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.15551891922950745, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0061, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.1841796338558197, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0058, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 0.3119230270385742, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.006, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.2633327841758728, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0059, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.24567869305610657, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0055, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.3697315454483032, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0061, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.1941021829843521, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0052, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.2610131502151489, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.007, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.24856074154376984, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0062, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.27259066700935364, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0052, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.20962993800640106, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0055, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.4015270471572876, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0062, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.22935271263122559, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0063, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.29984018206596375, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0059, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.35775551199913025, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0079, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.15501125156879425, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0054, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.3543296158313751, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.1982075721025467, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0064, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.2616399824619293, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0062, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.2612541615962982, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0064, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3081730008125305, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0055, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.24024926126003265, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0083, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.20793405175209045, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0055, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.21445533633232117, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0058, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.24078251421451569, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0059, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.36214157938957214, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0061, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.2583295702934265, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0054, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.2641732394695282, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0069, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.2179708331823349, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0049, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.27418699860572815, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0049, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.3894921839237213, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0076, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.3912152945995331, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0063, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.16886518895626068, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0059, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.2731325626373291, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.3299262225627899, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.007, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.2671407163143158, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0058, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.2701479196548462, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0059, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.3803080916404724, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.2621704041957855, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0061, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.27780428528785706, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0065, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.3326016962528229, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0081, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.3632255792617798, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0069, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.24395202100276947, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0065, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.3215671181678772, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0066, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0065, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.31547197699546814, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0043, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.1893424689769745, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0059, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.27042335271835327, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0059, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.22597061097621918, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0063, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.1742873191833496, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0062, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.16797663271427155, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0048, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.42558521032333374, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0075, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.37216684222221375, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0061, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.19943472743034363, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0065, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.2211161106824875, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0075, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.2680184245109558, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0052, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.2402123361825943, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0051, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.1881084442138672, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0066, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.26134756207466125, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0063, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.3185539245605469, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0062, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.3118845820426941, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0061, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.22595946490764618, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.007, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.2627023458480835, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0067, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.2984865605831146, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0051, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0057, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.3078263998031616, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0074, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.17885653674602509, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0057, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.37737196683883667, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0058, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.21651378273963928, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0053, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.1974128633737564, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0059, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.17184904217720032, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0058, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.3074864447116852, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0059, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.28784239292144775, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0061, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.3435216546058655, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0065, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.38048845529556274, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0057, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.1875533014535904, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0052, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.48555630445480347, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0063, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 0.25066429376602173, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0055, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.2763892412185669, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0059, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.21217335760593414, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0092, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.23555652797222137, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0064, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.14828811585903168, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.006, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 0.27303484082221985, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0047, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.14681454002857208, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0067, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.43693456053733826, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0081, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.2940906286239624, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0059, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.20382657647132874, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0074, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0069, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.31879740953445435, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0062, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4898712933063507, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0051, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.17142456769943237, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0061, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.14010348916053772, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0045, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.26882827281951904, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0056, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.2636195421218872, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0048, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.24932081997394562, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0045, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.3367895185947418, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0049, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.15173649787902832, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0053, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.34083831310272217, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0072, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3327343165874481, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0048, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.36545902490615845, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0076, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.22761192917823792, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0067, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.19272181391716003, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0072, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.2881070375442505, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.006, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.32841676473617554, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0063, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.19850151240825653, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0052, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.31401291489601135, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.4023345112800598, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0058, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.25802844762802124, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0051, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.19678954780101776, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0053, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.4545653164386749, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0073, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.36174362897872925, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0068, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.31692951917648315, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0063, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.3470834195613861, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.29541268944740295, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0062, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.26377183198928833, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.006, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.2019137591123581, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0058, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.45156505703926086, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.007, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.15810425579547882, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.006, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.20093902945518494, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.006, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.28989917039871216, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0062, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.39454182982444763, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0063, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.25967612862586975, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0069, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.2058791220188141, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0065, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.26367849111557007, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0074, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.2432256042957306, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0054, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.19844679534435272, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.16757237911224365, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0052, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.2988821566104889, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0047, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.2231496274471283, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0048, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.265029639005661, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0048, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.41179928183555603, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.0049, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.33498677611351013, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0052, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.2323407232761383, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0048, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.27306419610977173, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0061, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.2791977822780609, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0088, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.453421026468277, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0073, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0063, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.2572932839393616, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0056, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.19572272896766663, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0051, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.2831172049045563, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0057, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.21267575025558472, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0059, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.3220005929470062, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0057, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.2515857517719269, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0063, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.18344618380069733, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0052, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.34515154361724854, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0052, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.16711464524269104, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0054, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.3027217984199524, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.006, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.31168296933174133, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.007, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5778804421424866, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0056, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.2591782212257385, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0061, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.2449295073747635, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0046, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 0.19733767211437225, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0054, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.14837461709976196, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0053, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.3784295916557312, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0054, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.2400134950876236, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.17671307921409607, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0051, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.2664073705673218, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.006, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.25426605343818665, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0062, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.26733267307281494, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0049, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.46151378750801086, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.006, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.17070212960243225, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0062, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.42009514570236206, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0052, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.20439159870147705, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0053, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.25189417600631714, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0066, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.21402288973331451, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0072, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.294109046459198, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0061, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.29355865716934204, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0061, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.2937833368778229, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0061, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.1926010102033615, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0056, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.21794214844703674, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0065, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.23409108817577362, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0067, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.4696379005908966, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0062, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.28415724635124207, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0061, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.22433705627918243, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0064, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3090682923793793, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0056, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.23742817342281342, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0057, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.2670089900493622, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0052, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.2810697555541992, + "learning_rate": 1.299277443549658e-05, + "loss": 0.007, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.44233059883117676, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0069, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.14227768778800964, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0064, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.298776239156723, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0072, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.2882034480571747, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0064, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.23135380446910858, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0064, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.2870500981807709, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.005, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0064, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.2949783504009247, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0081, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.2215491235256195, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.26351356506347656, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0082, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.1909482628107071, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0052, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.13428187370300293, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0068, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.2125115543603897, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0048, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.27032148838043213, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0056, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.20981402695178986, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0069, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.24961373209953308, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0073, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.13643066585063934, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0054, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.25289252400398254, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.4061530828475952, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.006, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.29924723505973816, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0055, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.37029367685317993, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0053, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.37273409962654114, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0066, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.18242980539798737, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0054, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.18563945591449738, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.32972440123558044, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0045, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 0.3327874541282654, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0065, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.2077408730983734, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0055, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.1813255399465561, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0055, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.17811767756938934, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.20526157319545746, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0043, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.112189382314682, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0055, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.29082757234573364, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0099, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.23212411999702454, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0067, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.17449915409088135, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0047, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.3327349126338959, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0047, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.2709571123123169, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0056, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.19788618385791779, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0063, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.22075456380844116, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0064, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.2943982779979706, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0057, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.1718410849571228, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.3546068072319031, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0055, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.18132814764976501, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0047, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.17795684933662415, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0048, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.22964486479759216, + "learning_rate": 1.257232766480803e-05, + "loss": 0.005, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.3259448707103729, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0072, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.18410101532936096, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0045, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.28669047355651855, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0056, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.25986725091934204, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0055, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.1731722205877304, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0053, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.17501944303512573, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.005, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.2749968469142914, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0046, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.26125603914260864, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0055, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.22476239502429962, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0103, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.26169249415397644, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0067, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.19236186146736145, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0048, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.26535508036613464, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0055, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.2534106373786926, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.29464206099510193, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0076, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.3711875081062317, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0059, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0055, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.27274343371391296, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.006, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.15951389074325562, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0069, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.33735600113868713, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0064, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.19443227350711823, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0051, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.1960541307926178, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0049, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21133695542812347, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0066, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.22702853381633759, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.006, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.22489185631275177, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0061, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.33164891600608826, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0067, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.22196516394615173, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.0055, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.19532594084739685, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0048, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.41902172565460205, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0064, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.30388328433036804, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0052, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.2507944703102112, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0051, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.30817684531211853, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0052, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.27485454082489014, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.006, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.14287802577018738, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 0.14513961970806122, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.3345814645290375, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0051, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.2974685728549957, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0049, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.3455393612384796, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0062, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.16792115569114685, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.3038713335990906, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.005, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.2928559184074402, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0054, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.2317439168691635, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0039, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.3498123586177826, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0067, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.2850436866283417, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0045, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.18316122889518738, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0089, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.34362390637397766, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0066, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.13047993183135986, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0057, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.3403606116771698, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0055, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.27717292308807373, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0043, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.27412480115890503, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.1914675235748291, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0075, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.3778243958950043, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0084, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.20566068589687347, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.007, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.1868937760591507, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0051, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.24719548225402832, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.005, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0053, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4353996217250824, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.005, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.31571000814437866, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.005, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.14182177186012268, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0048, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.3461489975452423, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0062, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.17980965971946716, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0043, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.28671878576278687, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0048, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.18663623929023743, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0072, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.25223061442375183, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0063, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.20179906487464905, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0058, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.37325599789619446, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0079, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.18855971097946167, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.2992260754108429, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0051, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.18020357191562653, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0046, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.2106374204158783, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0044, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3749687373638153, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.1616801619529724, + "learning_rate": 1.188676298665799e-05, + "loss": 0.007, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.20882001519203186, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0143, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.16600479185581207, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0052, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.406480073928833, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0051, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.27349016070365906, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0056, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.2340608835220337, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0044, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.3165459632873535, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0042, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.19552721083164215, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0047, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.21882636845111847, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0061, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.23699741065502167, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0052, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.283207505941391, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0053, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.2782933712005615, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0062, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.3389151096343994, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0074, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.25642505288124084, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.19476772844791412, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0067, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.1992277055978775, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0057, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.21006375551223755, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0058, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.18808932602405548, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0073, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.258075475692749, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0052, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.29291409254074097, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.19002115726470947, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0041, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.4246057868003845, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 0.16166792809963226, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.005, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.35779255628585815, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0065, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.20405125617980957, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0082, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.23229332268238068, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0095, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.21156901121139526, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0074, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.22334401309490204, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0051, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.18344342708587646, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0048, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.22982414066791534, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0056, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.24991759657859802, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0046, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.27965986728668213, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.309841126203537, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0054, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.20964398980140686, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0044, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.45226722955703735, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0057, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.17177052795886993, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0064, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.8886939287185669, + "learning_rate": 1.153689339251154e-05, + "loss": 0.008, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.14726528525352478, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0066, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.32135209441185, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0064, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.22926779091358185, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.21345189213752747, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0047, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.31324461102485657, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0072, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.2185574620962143, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0047, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.36229151487350464, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0042, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.3479749262332916, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0053, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.23806153237819672, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0065, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.30633601546287537, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0079, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.2326052039861679, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0063, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 0.1756114363670349, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0064, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.18622055649757385, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0045, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.3261238932609558, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.16155003011226654, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0057, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.22661013901233673, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0046, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.24310468137264252, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0044, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.16182619333267212, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0056, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.1656215786933899, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0039, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.2945510447025299, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0049, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.24436083436012268, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.34221476316452026, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0069, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.26235878467559814, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0055, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.23333275318145752, + "learning_rate": 1.130316049722011e-05, + "loss": 0.005, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.23382601141929626, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0057, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 0.1693800389766693, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0058, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.3740929067134857, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.005, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.26146796345710754, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0038, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.13361674547195435, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0053, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8631370663642883, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0085, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.2952764630317688, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0054, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.23047442734241486, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0054, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0059, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.3246142864227295, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0066, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.31531205773353577, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0045, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4806351959705353, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0089, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.15645328164100647, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0051, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.29767802357673645, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0044, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.23338516056537628, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0055, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.20454354584217072, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0049, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.2087928056716919, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.004, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.18911990523338318, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0058, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.16931432485580444, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.3027138411998749, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0055, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.22635169327259064, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.26646292209625244, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0047, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.20067426562309265, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0054, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.22507227957248688, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0076, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.18533077836036682, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.005, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.1757635474205017, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0077, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.2326493263244629, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.006, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.2661048471927643, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0048, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.3285987079143524, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0047, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.3764145076274872, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.005, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.19637148082256317, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0048, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 0.16601431369781494, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.005, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.12405529618263245, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0036, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.21413138508796692, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.3323937952518463, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0057, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.20915299654006958, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0054, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.28372666239738464, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0048, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.32995301485061646, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0051, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.2148507684469223, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0061, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.22549118101596832, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.005, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.19749189913272858, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0049, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.250184565782547, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0065, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.23174546658992767, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.2707926034927368, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0049, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.175989031791687, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0058, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.2267833948135376, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0044, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.3495822846889496, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0048, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.2051204890012741, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0063, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.22149987518787384, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0058, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.21434035897254944, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0046, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.2996143400669098, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0065, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.22886960208415985, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0053, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.3317148685455322, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.005, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.45717868208885193, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0062, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.1223258301615715, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0051, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.2037084549665451, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0046, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.3772616982460022, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0045, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.30312252044677734, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0069, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.14988413453102112, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0047, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.3409348130226135, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0069, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.2308650016784668, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0049, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.15572187304496765, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0051, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.1962181180715561, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0049, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.337464302778244, + "learning_rate": 1.067930046280971e-05, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.17047251760959625, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0045, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.3098141849040985, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0043, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.17919068038463593, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0052, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.3461310863494873, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.006, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.37006744742393494, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0066, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.19726566970348358, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.005, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.1319705843925476, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0049, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.2131422460079193, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0055, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.1435563862323761, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0067, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.24024318158626556, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0055, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.1511068344116211, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0052, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.16795606911182404, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0047, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.1475641280412674, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0046, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.21277494728565216, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0048, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.2511015832424164, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.24675171077251434, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0059, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.2560728192329407, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0055, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.30879196524620056, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.005, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.1838868409395218, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0052, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.1673516035079956, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0056, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.20293423533439636, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0047, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.25513023138046265, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0052, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.26149800419807434, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.27551159262657166, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0041, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.2508440911769867, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0043, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.2889135181903839, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0043, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.1755184680223465, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0051, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.2095116674900055, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0064, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.33451047539711, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0079, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.44589516520500183, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0064, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.24158142507076263, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.15632936358451843, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.006, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.10808487981557846, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0065, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.1782998889684677, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0046, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.16395118832588196, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.004, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 0.30205732583999634, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0058, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.1561775654554367, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.004, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.1649634838104248, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0062, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.15428072214126587, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0043, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.11285894364118576, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0067, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.3470291793346405, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0056, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.16610246896743774, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0051, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.29931193590164185, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0051, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.15366005897521973, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.005, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.2352767139673233, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0057, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.19226962327957153, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0042, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.1903623789548874, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0044, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.4167932868003845, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0071, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.2913760840892792, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0046, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.2632276713848114, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0063, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.21258050203323364, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0043, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.19750680029392242, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.0032, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.2896588444709778, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0045, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3017624020576477, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0074, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.18355949223041534, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0051, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.16483789682388306, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0056, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.2190672904253006, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0043, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.28435200452804565, + "learning_rate": 1.011517750003287e-05, + "loss": 0.005, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.2564929723739624, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0049, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.2592712342739105, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0048, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.18716935813426971, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0047, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.18236829340457916, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0049, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.27956655621528625, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0056, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.13664546608924866, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0048, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.21617569029331207, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0052, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.2196502536535263, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0054, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.20864732563495636, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0041, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.38381293416023254, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.005, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.1605401486158371, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0045, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.2079813927412033, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0051, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.2110205590724945, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0054, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.2421400547027588, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0048, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.41358140110969543, + "learning_rate": 9.969762660447491e-06, + "loss": 0.006, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.23386628925800323, + "learning_rate": 9.960077585586335e-06, + "loss": 0.005, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.20425592362880707, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0059, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.21164651215076447, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0042, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.1642364114522934, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0034, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.18716906011104584, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0044, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.15626995265483856, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0044, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.18394386768341064, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0044, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.3590037524700165, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0073, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.2103291153907776, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0051, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.19865299761295319, + "learning_rate": 9.87296819358355e-06, + "loss": 0.006, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.2052467316389084, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0065, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.31245940923690796, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0049, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.2959006726741791, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0042, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.33695659041404724, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0071, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.20898328721523285, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0062, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.3500119149684906, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0049, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.3926694095134735, + "learning_rate": 9.805290087509098e-06, + "loss": 0.007, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.24234539270401, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0039, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.1705496460199356, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0056, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.2907398045063019, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0048, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.2366454005241394, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0047, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.25498414039611816, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0046, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.163838192820549, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0048, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.1613040417432785, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0048, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.3639470338821411, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0042, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.22151169180870056, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0043, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.13474372029304504, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0051, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.2601003050804138, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0038, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.20202822983264923, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.18514803051948547, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0061, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.16678287088871002, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0038, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.17608965933322906, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0041, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.26356828212738037, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0059, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.297612726688385, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0047, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.16363881528377533, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.12642459571361542, + "learning_rate": 9.621949874438232e-06, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.3339644968509674, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0052, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.20784282684326172, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0046, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.28467273712158203, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0047, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.3124372661113739, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0051, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.3490087389945984, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0047, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.15114343166351318, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0051, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.41157594323158264, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0058, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.40405890345573425, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0045, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.1149911880493164, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0087, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.18746539950370789, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0058, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.1327875554561615, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0049, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.1530160903930664, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0038, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.2663615047931671, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0049, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.3390499949455261, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0046, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.2461002618074417, + "learning_rate": 9.477616135359713e-06, + "loss": 0.006, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.2141093611717224, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0049, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.20443470776081085, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0052, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.14927290380001068, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0039, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.3012462854385376, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0047, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.33484792709350586, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0045, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.19986321032047272, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0041, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.21612870693206787, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0043, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.19541047513484955, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0044, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.24203962087631226, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0049, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.1470087766647339, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0049, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.2336059808731079, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0048, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.32893121242523193, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0044, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.32034680247306824, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0055, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.27538758516311646, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0049, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.18869644403457642, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0065, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.2719379961490631, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0047, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.2850756347179413, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0043, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.19997543096542358, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0068, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.19222821295261383, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0044, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.16414248943328857, + "learning_rate": 9.285803018919292e-06, + "loss": 0.004, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.23754803836345673, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0039, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.2682085335254669, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0048, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.18268488347530365, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0046, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.14906349778175354, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0034, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.19079554080963135, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0041, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.09538780897855759, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0043, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.19193744659423828, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0044, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.1366361379623413, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0049, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.29436588287353516, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0052, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.24179348349571228, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0047, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.236627459526062, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0061, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.1719210296869278, + "learning_rate": 9.171095634265995e-06, + "loss": 0.0054, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.2724406123161316, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0048, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.09852395206689835, + "learning_rate": 9.152007262148612e-06, + "loss": 0.004, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.23493632674217224, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0049, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.20697079598903656, + "learning_rate": 9.132927564918328e-06, + "loss": 0.0047, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.16597376763820648, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0048, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.23542962968349457, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0046, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.18859006464481354, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0054, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.16773538291454315, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0044, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.2122378647327423, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0042, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.18205690383911133, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0046, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.1791398823261261, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0043, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.4446735680103302, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0052, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.32150915265083313, + "learning_rate": 9.047178679583151e-06, + "loss": 0.005, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.15855731070041656, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0045, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.19377414882183075, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0057, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.25969046354293823, + "learning_rate": 9.018636566864313e-06, + "loss": 0.006, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.2349981814622879, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0073, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.1853523701429367, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0051, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.22417226433753967, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0058, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.1969340741634369, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0058, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.18523764610290527, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0059, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.28188323974609375, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0052, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.18134717643260956, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0048, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.15660132467746735, + "learning_rate": 8.942627394858978e-06, + "loss": 0.004, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.3179869055747986, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0044, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.14007267355918884, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.31531354784965515, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0062, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.1867508888244629, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0054, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4172282814979553, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0056, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.21233956515789032, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0054, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.13055016100406647, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0048, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.24662990868091583, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0054, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.1877284198999405, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0045, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.20158089697360992, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0052, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.23169469833374023, + "learning_rate": 8.83836825410936e-06, + "loss": 0.0048, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.27991265058517456, + "learning_rate": 8.828905148874785e-06, + "loss": 0.008, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.3321090638637543, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0063, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.14790703356266022, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0033, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.1504756361246109, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0052, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.2211659848690033, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0038, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.1777208149433136, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0041, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.2586643397808075, + "learning_rate": 8.772180411864604e-06, + "loss": 0.006, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.2705499529838562, + "learning_rate": 8.762735374981932e-06, + "loss": 0.0047, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.16527540981769562, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0037, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.24313445389270782, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0057, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.16705767810344696, + "learning_rate": 8.734416061983528e-06, + "loss": 0.004, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.20638783276081085, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0052, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.26159438490867615, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0039, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.30387070775032043, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0038, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.24292278289794922, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0042, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.3707493543624878, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0056, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.41142478585243225, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0044, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.22052627801895142, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0047, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.14626234769821167, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0047, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.25504666566848755, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0046, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.2020457535982132, + "learning_rate": 8.640192851412488e-06, + "loss": 0.006, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.2440478354692459, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0047, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.12040785700082779, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0044, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.25539812445640564, + "learning_rate": 8.611979388060327e-06, + "loss": 0.006, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.20701228082180023, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0041, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.24188214540481567, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0063, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.24987974762916565, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0063, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.20973123610019684, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0049, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.19898714125156403, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0061, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21703247725963593, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0056, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.18688541650772095, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0054, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.30194586515426636, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0049, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.17975366115570068, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0046, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.25966599583625793, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0044, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.1702205240726471, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0058, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.18940114974975586, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0052, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.18239127099514008, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0047, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.14571616053581238, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0046, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.17203395068645477, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0038, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.249881312251091, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0056, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.296194463968277, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0044, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.21376049518585205, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0052, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.2952374815940857, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0036, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.20862646400928497, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0051, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.17828255891799927, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0053, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.20771050453186035, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0038, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3046565651893616, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.12605167925357819, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0046, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.13702887296676636, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0038, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.11569058150053024, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0042, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.27488255500793457, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0054, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.30820342898368835, + "learning_rate": 8.349909816537207e-06, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.3108576536178589, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0056, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.16087505221366882, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0044, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.27139320969581604, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0055, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.17057007551193237, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0036, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.13946233689785004, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0057, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.2342602014541626, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0038, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.17249339818954468, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0047, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.2641673684120178, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0044, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.18304336071014404, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0041, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.25955966114997864, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0045, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.2159314751625061, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0038, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.254371702671051, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0043, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.10616741329431534, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0036, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.38598379492759705, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0065, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.3797863721847534, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0048, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.2059139758348465, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0062, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.19991335272789001, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0043, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.17376656830310822, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0047, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.17102457582950592, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0056, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.501983642578125, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0065, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.40338510274887085, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0055, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.10511627048254013, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0052, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.2610682249069214, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 0.09666074812412262, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0058, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.19014683365821838, + "learning_rate": 8.117972135268806e-06, + "loss": 0.005, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.2999255657196045, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0041, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.20351538062095642, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0049, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.1562410295009613, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0034, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.14160799980163574, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0035, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.10796743631362915, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0056, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.28861188888549805, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0043, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.3835368752479553, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.21850043535232544, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0038, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.2950346767902374, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0068, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.13051068782806396, + "learning_rate": 8.025779439806006e-06, + "loss": 0.0041, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.11036359518766403, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0074, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.35306516289711, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0087, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.29782727360725403, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0045, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.20690713822841644, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0042, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.16064110398292542, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0038, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.2477649450302124, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0042, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.25939393043518066, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0045, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3345301151275635, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0045, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.19570066034793854, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0052, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.09655601531267166, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.13345655798912048, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0031, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.3130756616592407, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0072, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.16259168088436127, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0036, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.2581227123737335, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.36706119775772095, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0043, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.1705426573753357, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0069, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.4281153380870819, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0057, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.25743696093559265, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0036, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.17692404985427856, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0044, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.17617255449295044, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0043, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.193951815366745, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0042, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.2187023162841797, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0047, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.21488729119300842, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0039, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.13388743996620178, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0043, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.26977118849754333, + "learning_rate": 7.796848308199681e-06, + "loss": 0.004, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.40695786476135254, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0049, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.29070621728897095, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0056, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.2745647728443146, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0056, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.20881050825119019, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0057, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.17475518584251404, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0041, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.2414310723543167, + "learning_rate": 7.742248115573104e-06, + "loss": 0.004, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.20051640272140503, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.18383435904979706, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0038, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.16546988487243652, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0041, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.17165544629096985, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.25065234303474426, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0048, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.19762223958969116, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0038, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.23894545435905457, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0033, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.2860289216041565, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0053, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.3699626624584198, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0061, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.2370971292257309, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0043, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.19790691137313843, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0042, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.14648208022117615, + "learning_rate": 7.633462930388875e-06, + "loss": 0.005, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.105158232152462, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0032, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.24994254112243652, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0042, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.30648791790008545, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0058, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.16284243762493134, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0047, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.14919471740722656, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0045, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.14879491925239563, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0047, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.11741457879543304, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0041, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.09406878799200058, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0029, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.20860706269741058, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0054, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.24234607815742493, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0047, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.27025938034057617, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0042, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.15129081904888153, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0046, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.11173490434885025, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0035, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.2204807698726654, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0036, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.20111115276813507, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0087, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.213748961687088, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0045, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.21150177717208862, + "learning_rate": 7.480328799175369e-06, + "loss": 0.004, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.2450210005044937, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0036, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.16161729395389557, + "learning_rate": 7.4623904967312e-06, + "loss": 0.004, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.15077564120292664, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0038, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3078431487083435, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0051, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.15213221311569214, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0032, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.12404917925596237, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0042, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.18779516220092773, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0041, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.4039568603038788, + "learning_rate": 7.408675563767873e-06, + "loss": 0.005, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.2045651078224182, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0057, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.3885338306427002, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0049, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.253049373626709, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0059, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.250356525182724, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0076, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.3269367814064026, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0112, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.15401138365268707, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0052, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.1631775051355362, + "learning_rate": 7.346200065486093e-06, + "loss": 0.004, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.17112085223197937, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0038, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.24018551409244537, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0056, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.17964349687099457, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0057, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.1747465431690216, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0053, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.21299205720424652, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0038, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.13219258189201355, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0057, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 1.0558332204818726, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0066, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2154799997806549, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0041, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.13665339350700378, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0044, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.2101723700761795, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0039, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.13208501040935516, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0054, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.09342823177576065, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0032, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.22464905679225922, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0055, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.17030438780784607, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0042, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.17673689126968384, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0055, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.24041922390460968, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0048, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.14808662235736847, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0031, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.2489791214466095, + "learning_rate": 7.186522173441719e-06, + "loss": 0.004, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.19468742609024048, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0042, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.15028323233127594, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0061, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.13852037489414215, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.1401798278093338, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0063, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.1831122189760208, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0034, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.2867920994758606, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.13363438844680786, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0038, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.20085696876049042, + "learning_rate": 7.116016051769541e-06, + "loss": 0.004, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.1598372906446457, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0042, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.09672598540782928, + "learning_rate": 7.098434895408162e-06, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.18206225335597992, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0048, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.1818019449710846, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0038, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.21658800542354584, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0044, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.08513368666172028, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.10634194314479828, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0044, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.12106078863143921, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0037, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.11508465558290482, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0036, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.20805053412914276, + "learning_rate": 7.028294242074066e-06, + "loss": 0.004, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.23920200765132904, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0045, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.1300375908613205, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0045, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.23444809019565582, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0036, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.2636217772960663, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0044, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.31166398525238037, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.005, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.32881107926368713, + "learning_rate": 6.975884226362e-06, + "loss": 0.0055, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.41748252511024475, + "learning_rate": 6.967165692827958e-06, + "loss": 0.006, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.1588834673166275, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0039, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.23697984218597412, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0039, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.19356773793697357, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0061, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.16373206675052643, + "learning_rate": 6.932338988482141e-06, + "loss": 0.004, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.1331700086593628, + "learning_rate": 6.923644220932124e-06, + "loss": 0.004, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4039696753025055, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0057, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.30325421690940857, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0065, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.21767468750476837, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0038, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.17474445700645447, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0056, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.17118008434772491, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0045, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.44261473417282104, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0063, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.18502798676490784, + "learning_rate": 6.862915366041247e-06, + "loss": 0.004, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.19384194910526276, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0036, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.1448352187871933, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0044, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3728172779083252, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0038, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.31421783566474915, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0043, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.28181371092796326, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0045, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.2249889373779297, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0041, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.26402008533477783, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0043, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.22621415555477142, + "learning_rate": 6.793802468038111e-06, + "loss": 0.004, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.2681289315223694, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0045, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.17681041359901428, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0037, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.16526542603969574, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0032, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.30313149094581604, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0046, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.17628541588783264, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0065, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.1840096414089203, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0051, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.146232470870018, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0035, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.4804438352584839, + "learning_rate": 6.725005485342219e-06, + "loss": 0.005, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.2245558500289917, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0039, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.21845588088035583, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.1743943691253662, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0037, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.16978098452091217, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.27158796787261963, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0043, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.13516400754451752, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0048, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.1645064353942871, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0038, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.07616083323955536, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0046, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.13306911289691925, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.19445037841796875, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0044, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.18423207104206085, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0049, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.19280213117599487, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0043, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.25472623109817505, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0033, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.16799427568912506, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0031, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.2097395807504654, + "learning_rate": 6.596880604028027e-06, + "loss": 0.004, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.31450021266937256, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.16530238091945648, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0034, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.2506805956363678, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0038, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.1876160055398941, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0035, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.23704354465007782, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0041, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.13814999163150787, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0042, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.1164403185248375, + "learning_rate": 6.53748481975927e-06, + "loss": 0.0042, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.23078426718711853, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0038, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.21749110519886017, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0046, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.24972137808799744, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0041, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.2491082102060318, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0043, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.14915086328983307, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0048, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.2794116735458374, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0035, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.13765662908554077, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0047, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.14874878525733948, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0042, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.1800280064344406, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0057, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.17518648505210876, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0049, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0045, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.3590790033340454, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0039, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.14534324407577515, + "learning_rate": 6.427861749601945e-06, + "loss": 0.004, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.1662825047969818, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.27466440200805664, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0045, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.1323469579219818, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0047, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.12367355078458786, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0077, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.18238325417041779, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0058, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.2733745574951172, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0038, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.3367181420326233, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0039, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.20671530067920685, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0034, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.23353071510791779, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0033, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.21081902086734772, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0031, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.3426077365875244, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0049, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.3905622959136963, + "learning_rate": 6.327475567095824e-06, + "loss": 0.004, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.1888400912284851, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0041, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.23982487618923187, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0041, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.2061331421136856, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0046, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.17000116407871246, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0033, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.15905790030956268, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0049, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.16794419288635254, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0052, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.3003343641757965, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0061, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.1429288536310196, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0042, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.18542084097862244, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0047, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.2692892253398895, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0035, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.23286236822605133, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0037, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.0963423103094101, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0041, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.1425798237323761, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0043, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.0960182398557663, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0046, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.2674477994441986, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 0.16276703774929047, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0041, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.24255621433258057, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.003, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.20395220816135406, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0054, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.12099681794643402, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0082, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.14017170667648315, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0042, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.28132137656211853, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0043, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.27220970392227173, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0039, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.23647353053092957, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0058, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.20623824000358582, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0053, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.12366114556789398, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0037, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.23330192267894745, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0056, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.19991633296012878, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0031, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.1496160626411438, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0058, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.13247868418693542, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0037, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.19072194397449493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0057, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.10773085057735443, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0042, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.14058449864387512, + "learning_rate": 6.063685039328116e-06, + "loss": 0.005, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.10825464874505997, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0042, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.18059906363487244, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.1713389754295349, + "learning_rate": 6.039253929027638e-06, + "loss": 0.005, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.23789434134960175, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0047, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.17626744508743286, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0041, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.2091904729604721, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0044, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.17293672263622284, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0043, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.13156521320343018, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0039, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.19591976702213287, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0043, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.16212835907936096, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0039, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.10661022365093231, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0037, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.16630858182907104, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0038, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.11001022905111313, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0037, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.1888381838798523, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0044, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.19239328801631927, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0044, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.16555139422416687, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0032, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.19748231768608093, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0043, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.1546473354101181, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0049, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.30511707067489624, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0037, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.1722872257232666, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0048, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.1784086525440216, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0049, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.15101182460784912, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0042, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.1252688318490982, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0041, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.15101821720600128, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0043, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.21302345395088196, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0035, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.1591431051492691, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0033, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.16010484099388123, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0049, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.19287234544754028, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0037, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.1804349720478058, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0036, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.14769446849822998, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0044, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.15914054214954376, + "learning_rate": 5.813791207086085e-06, + "loss": 0.004, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.19632315635681152, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0034, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3017818331718445, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0046, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.2728461027145386, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0044, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.18619874119758606, + "learning_rate": 5.781966956563247e-06, + "loss": 0.004, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.1235085129737854, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0037, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.15798084437847137, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0035, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.15713484585285187, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0036, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.15594886243343353, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0038, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.1558992713689804, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0037, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.20599815249443054, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0054, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.2785670757293701, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0042, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.22550497949123383, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0035, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.15210074186325073, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0035, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.18905121088027954, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0035, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.1337066888809204, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0046, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.23699362576007843, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0048, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.2480958253145218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0037, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.09328999370336533, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0047, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.3416430950164795, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0048, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.13258710503578186, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0032, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.18493984639644623, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0037, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.10433483123779297, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0045, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.18333138525485992, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0038, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.25164106488227844, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0058, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.17989882826805115, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0041, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.1597793847322464, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0036, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.1543695032596588, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0036, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.2985675036907196, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0043, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.1357773244380951, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0036, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.23978300392627716, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.005, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.12806151807308197, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0035, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.2222731113433838, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0039, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.16744646430015564, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0035, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.2162114977836609, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0048, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.14857177436351776, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0036, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.21318115293979645, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0032, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.257682204246521, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0036, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.254349946975708, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0042, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.148925319314003, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0029, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.1902056336402893, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0031, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.17580094933509827, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0026, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.18856695294380188, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0045, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.17185454070568085, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0039, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.1997966468334198, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0043, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.14173944294452667, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0033, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.20653635263442993, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0039, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.19571708142757416, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0026, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.0877508670091629, + "learning_rate": 5.438496901657042e-06, + "loss": 0.005, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.17305001616477966, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0038, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.16555450856685638, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0035, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.15395715832710266, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0035, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2430422455072403, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0032, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.2465265393257141, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0034, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.08382703363895416, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0038, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3427184224128723, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0042, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.13029031455516815, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0039, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.11826448887586594, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0035, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.1612391620874405, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0039, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21143540740013123, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0057, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.22977286577224731, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.005, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.11853202432394028, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0058, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.24277184903621674, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0038, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.2625603675842285, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0048, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.1333419382572174, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0033, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.09627685695886612, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0035, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.416618674993515, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0038, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.18699553608894348, + "learning_rate": 5.294041118587667e-06, + "loss": 0.004, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.1827329397201538, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0039, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.19719162583351135, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0034, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.09895205497741699, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0042, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.11187861114740372, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0036, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.154103085398674, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0037, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.11124159395694733, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0038, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.27686378359794617, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0041, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.12900429964065552, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0036, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.26441213488578796, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0032, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.2187345325946808, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.004, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.08503159135580063, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0034, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.12869144976139069, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0035, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.13212713599205017, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0027, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.23211228847503662, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0032, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.2017366737127304, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0043, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.21221789717674255, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0039, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.24497511982917786, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0046, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.15008985996246338, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0039, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.15641193091869354, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0039, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.2608455419540405, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0036, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.09808705747127533, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0038, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.18084567785263062, + "learning_rate": 5.129800405815733e-06, + "loss": 0.0045, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.1957635134458542, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0036, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.1479685753583908, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0045, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.14854201674461365, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0035, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.14744973182678223, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0033, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7196730375289917, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0071, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.22570419311523438, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0056, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.16870586574077606, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0042, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.12610554695129395, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0051, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.11198554188013077, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0042, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.13166265189647675, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0037, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.1181526631116867, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0037, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.2055635005235672, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0027, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.13400030136108398, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0028, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.09746947884559631, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0048, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.22124870121479034, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0048, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.09961193799972534, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0038, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.20024695992469788, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0036, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.3697144687175751, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0044, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.1713833063840866, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0038, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.1914745569229126, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0051, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.190393328666687, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0035, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.17361588776111603, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0038, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.19456325471401215, + "learning_rate": 4.961660586405147e-06, + "loss": 0.0036, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.15772588551044464, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0039, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.11680205166339874, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0045, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.3643893599510193, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0044, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.1628265231847763, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0036, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.10073156654834747, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0041, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.13039462268352509, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0045, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.12775596976280212, + "learning_rate": 4.911226880894818e-06, + "loss": 0.003, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.1513100564479828, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0044, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.1346164345741272, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0036, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.12880294024944305, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0041, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.3154917359352112, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0038, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.18458192050457, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0057, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.2524041533470154, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0038, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.11894001811742783, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0033, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.1094699576497078, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0037, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.11090611666440964, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0045, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.3179106116294861, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0041, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.09424899518489838, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0034, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.3028348982334137, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0035, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.30831560492515564, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0045, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34811046719551086, + "learning_rate": 4.81141273556404e-06, + "loss": 0.005, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.18413113057613373, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0044, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.11229179799556732, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.004, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.2966957688331604, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0056, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.10525348782539368, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0037, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.1479673534631729, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0049, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5229315757751465, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0051, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.17021632194519043, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0038, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.10177282989025116, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0036, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.17768025398254395, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0034, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.2090948224067688, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0027, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.1722206026315689, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0031, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.09709088504314423, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0037, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.1969165802001953, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0034, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.0810595229268074, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0038, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.22003750503063202, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0041, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.2809178829193115, + "learning_rate": 4.699083753549858e-06, + "loss": 0.003, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.1343737691640854, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0026, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.19191010296344757, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0035, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.16617201268672943, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0031, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.24936997890472412, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0032, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5643696188926697, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0053, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.19725625216960907, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0031, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.1692969799041748, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0035, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.17487913370132446, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0041, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.25642889738082886, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0043, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3692823350429535, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0039, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.230118989944458, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0047, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.1609203815460205, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0038, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.21201254427433014, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0029, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.10142157226800919, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0034, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.19121089577674866, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0033, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.156619131565094, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0045, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.14690659940242767, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0042, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.13466109335422516, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0041, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.3713383674621582, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0056, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.12184764444828033, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0038, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.23971956968307495, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0035, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.3320925235748291, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0052, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.11913793534040451, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0029, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.11725693941116333, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0044, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.1550632119178772, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0049, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.23413509130477905, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0043, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.16070885956287384, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0032, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.12317437678575516, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0038, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.3462170660495758, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0048, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.12654773890972137, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0045, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.06262557208538055, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0026, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.1439850926399231, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0045, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.24463413655757904, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0042, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.22048236429691315, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0036, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 0.10628963261842728, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0037, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.14685721695423126, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0043, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.18807503581047058, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0042, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.19162075221538544, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0047, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.2444164752960205, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0041, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.12120077759027481, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0037, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.19946682453155518, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.003, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.23982395231723785, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0035, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.13806626200675964, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0031, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2610985040664673, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0047, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.1384919434785843, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0055, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.14737965166568756, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0037, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.1304326057434082, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0036, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.22288398444652557, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0035, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.11266916245222092, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0071, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.15941838920116425, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0024, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.18921831250190735, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0026, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.10112889111042023, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0037, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.1865631341934204, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0028, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.20046782493591309, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0042, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.11953336745500565, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0026, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.17050383985042572, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0029, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.28782936930656433, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0042, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.2104359269142151, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0034, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.12790441513061523, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0048, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.12111827731132507, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0043, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.2542783319950104, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0043, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.17177502810955048, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0036, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.14121277630329132, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0051, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.11357807368040085, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0033, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.3277477025985718, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0043, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.37000587582588196, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0037, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.11122190207242966, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0034, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.14530375599861145, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0039, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.19974422454833984, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0033, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.15466761589050293, + "learning_rate": 4.230335566422999e-06, + "loss": 0.003, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.19129224121570587, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0042, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.2474614828824997, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0046, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.15569351613521576, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0048, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.09572251886129379, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0044, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.13737086951732635, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0043, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.12266672402620316, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0039, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.09208404272794724, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0039, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.16571840643882751, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0047, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.3071173131465912, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0039, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.09059276431798935, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0031, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.16070133447647095, + "learning_rate": 4.160146936563338e-06, + "loss": 0.004, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.12942227721214294, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0027, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.13913804292678833, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0048, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.206321582198143, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0053, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.20973987877368927, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0041, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.23191478848457336, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0063, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.18233250081539154, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0029, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.133034810423851, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0043, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.10777711123228073, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0028, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.14128559827804565, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0034, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.13215866684913635, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0036, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.18918493390083313, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0043, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.14459657669067383, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0045, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.17287056148052216, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0039, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.13909804821014404, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0039, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.14798089861869812, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0037, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.10916659235954285, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0023, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.1151762530207634, + "learning_rate": 4.053587511509546e-06, + "loss": 0.005, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.14232765138149261, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0032, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.09513483196496964, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0043, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.09156285226345062, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0039, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.1405397206544876, + "learning_rate": 4.028855757736123e-06, + "loss": 0.004, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.15840958058834076, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0037, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.190508171916008, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0037, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.15277954936027527, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0028, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 0.14111991226673126, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0034, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.31528833508491516, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0044, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.1420607715845108, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0039, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.1340852528810501, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0032, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.11166475713253021, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0033, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.13635945320129395, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0028, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.15865778923034668, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0036, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.08569981157779694, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0026, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.1041082963347435, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0033, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.17262709140777588, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0041, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.20455610752105713, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0035, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.15869568288326263, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0027, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.14855770766735077, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0042, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.08842955529689789, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0028, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.18251122534275055, + "learning_rate": 3.919189353330104e-06, + "loss": 0.003, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.24990014731884003, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0038, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.1088186502456665, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0036, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.09780745953321457, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0042, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.1625395119190216, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0033, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.16848890483379364, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0033, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.19756828248500824, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0051, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.15720513463020325, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0033, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.22365699708461761, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0028, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.07928138971328735, + "learning_rate": 3.865363184624925e-06, + "loss": 0.003, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.26314112544059753, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0049, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.1249697357416153, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0036, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 0.09758924692869186, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0031, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.08506497740745544, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0037, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.1978219896554947, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0036, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 0.15215060114860535, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0033, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.1608658879995346, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0034, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.10854586958885193, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0026, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.1394745409488678, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0036, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.0879194363951683, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0034, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.11169253289699554, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0037, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.12410115450620651, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0033, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.13719962537288666, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0032, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.10031221807003021, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0046, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.1156797707080841, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0042, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.1494375318288803, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0036, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.08620154112577438, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0034, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.16659799218177795, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0053, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.1313968300819397, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0046, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.21495603024959564, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0037, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.11284582316875458, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0033, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.18478819727897644, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0038, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.12338980287313461, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0029, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.09782207757234573, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0037, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.10959567129611969, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0029, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.17048455774784088, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0035, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.12739142775535583, + "learning_rate": 3.707974016467e-06, + "loss": 0.0028, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.19227802753448486, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0045, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.11818226426839828, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0032, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.10820474475622177, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0026, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.11386270821094513, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0026, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.23488907516002655, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0035, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.12526266276836395, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0037, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.22899770736694336, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0035, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.13044586777687073, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0039, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.3652730882167816, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0041, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.1416187435388565, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0036, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.11176013946533203, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0037, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.09744516015052795, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0037, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.11925745010375977, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0034, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.0942603051662445, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0031, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.12849931418895721, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0042, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.11910247802734375, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0025, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.09603044390678406, + "learning_rate": 3.612069140022124e-06, + "loss": 0.004, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.1962766945362091, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0038, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.15775476396083832, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0043, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.1549777239561081, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0041, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.24444808065891266, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0029, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.12734061479568481, + "learning_rate": 3.584337233394337e-06, + "loss": 0.003, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.23149384558200836, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0042, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.1598765254020691, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0041, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.12173855304718018, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0046, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.09653043001890182, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0032, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.13262024521827698, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.003, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.2603001892566681, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0031, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.24721759557724, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0028, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.11963216960430145, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0047, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.12025906145572662, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0024, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.1969287395477295, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0033, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.24025285243988037, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0047, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.07612641155719757, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0026, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.18313643336296082, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0038, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.3311282694339752, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0036, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.16643930971622467, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0034, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.11099164932966232, + "learning_rate": 3.497061149826966e-06, + "loss": 0.003, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.11017951369285583, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0031, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.17948199808597565, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0037, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.1002451479434967, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0042, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.13393986225128174, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0038, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.0963628888130188, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0027, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.14946860074996948, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0039, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.2011580467224121, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0045, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.12523533403873444, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0033, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.22948165237903595, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0031, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.24120132625102997, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0046, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.30398526787757874, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0043, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.13554388284683228, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0033, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.14989149570465088, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.003, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.15678660571575165, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0037, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.29919424653053284, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0042, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.08935242891311646, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.004, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.22928708791732788, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0042, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.18873436748981476, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0036, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.0956149622797966, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0037, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.13334470987319946, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0034, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.13492803275585175, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0033, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.13227517902851105, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0032, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.11342936754226685, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0042, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.3178110122680664, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0038, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.04432455077767372, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0028, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.09680923074483871, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0036, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.2477794885635376, + "learning_rate": 3.354907302553392e-06, + "loss": 0.004, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.11931425333023071, + "learning_rate": 3.349767211300933e-06, + "loss": 0.004, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.1410735696554184, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0032, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.16996408998966217, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0041, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.1275407373905182, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0038, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.10107860714197159, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0036, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.10196204483509064, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0032, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.10152500867843628, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0038, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.19691230356693268, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0045, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.33672890067100525, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0038, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.09857437759637833, + "learning_rate": 3.303911119253872e-06, + "loss": 0.004, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.13289818167686462, + "learning_rate": 3.298861077451818e-06, + "loss": 0.003, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.18509522080421448, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0064, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.11460676789283752, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0024, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.12012742459774017, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0039, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.356365442276001, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0035, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.5451288223266602, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0068, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.1067429855465889, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0034, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.2349347621202469, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0041, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.09102735668420792, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0032, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.11968998610973358, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0034, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.1355520486831665, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0034, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.11785157769918442, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0044, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.12043727189302444, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0047, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.13475126028060913, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0033, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.12776954472064972, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0032, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.10374128818511963, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0054, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.08750293403863907, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0035, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.1284732222557068, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0047, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.12900014221668243, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0042, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.11983122676610947, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0031, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.20311471819877625, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0045, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.1965232491493225, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0047, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.10592305660247803, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0031, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.10558371245861053, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0032, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.12083200365304947, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0025, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.2367735505104065, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0024, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.1387612670660019, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.004, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.18766231834888458, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0035, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.18110574781894684, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0031, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.1886875331401825, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.003, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.09323479980230331, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.004, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.1508265882730484, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0039, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.11250200122594833, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0039, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.23230062425136566, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.004, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.179047629237175, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.004, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.13797952234745026, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0042, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.12740616500377655, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0027, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.11396504938602448, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0037, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.12815812230110168, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0033, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.17100073397159576, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0039, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.09657446295022964, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0031, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.3235829472541809, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0049, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.17849496006965637, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0033, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.16907230019569397, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0028, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.26099368929862976, + "learning_rate": 3.085688933413021e-06, + "loss": 0.003, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.21024562418460846, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0035, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.10564325749874115, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0032, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.10607697814702988, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0037, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.20698976516723633, + "learning_rate": 3.067194157156521e-06, + "loss": 0.003, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.20934849977493286, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0033, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.12407243996858597, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0037, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.13003374636173248, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0032, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.15529648959636688, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0048, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.12824782729148865, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0031, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.12616124749183655, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0037, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.2119731307029724, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0037, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.22325192391872406, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0035, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.10937803238630295, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0037, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.3106321692466736, + "learning_rate": 3.021609639602321e-06, + "loss": 0.0034, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2864716649055481, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0037, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.10637935250997543, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0037, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.11078158766031265, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0034, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.06270865350961685, + "learning_rate": 3.003637700546652e-06, + "loss": 0.003, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.12176132947206497, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0043, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.16978275775909424, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0032, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.2582871913909912, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0036, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.27402547001838684, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0031, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.15350353717803955, + "learning_rate": 2.981383959667165e-06, + "loss": 0.004, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.0939447432756424, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0033, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.16549192368984222, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0048, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.11002931743860245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0028, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.17383548617362976, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0032, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.18648599088191986, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0039, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.2366044819355011, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0036, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.1678195595741272, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0032, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.31918013095855713, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0045, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.14635732769966125, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0039, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.19166909158229828, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0028, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.11960610002279282, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0029, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.06636705994606018, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0025, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.17033624649047852, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0028, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.07974246889352798, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.003, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.1188567653298378, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0043, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.11378541588783264, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0028, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.11495907604694366, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0037, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.144247367978096, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0031, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.14722205698490143, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0046, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.10647077113389969, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0026, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.17438668012619019, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0032, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.17071637511253357, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0035, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.2201206386089325, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0041, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.14397655427455902, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0037, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.055822595953941345, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0035, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.13084810972213745, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0046, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.3321281373500824, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0038, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.1274777501821518, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0041, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.09797787666320801, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0034, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.1270579695701599, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0042, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.09015227854251862, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0034, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.12557077407836914, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0037, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.2725144922733307, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0036, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.13758502900600433, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0039, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.19999243319034576, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0038, + "step": 26000 + }, + { + "epoch": 1.5585115944634191, + "grad_norm": 0.1323961615562439, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0033, + "step": 26010 + }, + { + "epoch": 1.5591107915393372, + "grad_norm": 0.12714031338691711, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.0033, + "step": 26020 + }, + { + "epoch": 1.5597099886152557, + "grad_norm": 0.40822476148605347, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0041, + "step": 26030 + }, + { + "epoch": 1.5603091856911737, + "grad_norm": 0.14638100564479828, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0042, + "step": 26040 + }, + { + "epoch": 1.5609083827670922, + "grad_norm": 0.17443427443504333, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.0031, + "step": 26050 + }, + { + "epoch": 1.5615075798430103, + "grad_norm": 0.09581520408391953, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0036, + "step": 26060 + }, + { + "epoch": 1.5621067769189287, + "grad_norm": 0.14804130792617798, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.004, + "step": 26070 + }, + { + "epoch": 1.5627059739948468, + "grad_norm": 0.4015085697174072, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0062, + "step": 26080 + }, + { + "epoch": 1.5633051710707653, + "grad_norm": 0.3468920886516571, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.004, + "step": 26090 + }, + { + "epoch": 1.5639043681466833, + "grad_norm": 0.19594644010066986, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0042, + "step": 26100 + }, + { + "epoch": 1.5645035652226018, + "grad_norm": 0.09097496420145035, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0027, + "step": 26110 + }, + { + "epoch": 1.5651027622985199, + "grad_norm": 0.11387573927640915, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0025, + "step": 26120 + }, + { + "epoch": 1.5657019593744383, + "grad_norm": 0.17657096683979034, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0036, + "step": 26130 + }, + { + "epoch": 1.5663011564503564, + "grad_norm": 0.09257909655570984, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0033, + "step": 26140 + }, + { + "epoch": 1.5669003535262749, + "grad_norm": 0.15154404938220978, + "learning_rate": 2.771889969647e-06, + "loss": 0.0046, + "step": 26150 + }, + { + "epoch": 1.567499550602193, + "grad_norm": 0.07300597429275513, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0028, + "step": 26160 + }, + { + "epoch": 1.5680987476781114, + "grad_norm": 0.12779368460178375, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0028, + "step": 26170 + }, + { + "epoch": 1.5686979447540295, + "grad_norm": 0.12631577253341675, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0035, + "step": 26180 + }, + { + "epoch": 1.569297141829948, + "grad_norm": 0.3630695044994354, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0034, + "step": 26190 + }, + { + "epoch": 1.569896338905866, + "grad_norm": 0.18113726377487183, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.005, + "step": 26200 + }, + { + "epoch": 1.5704955359817845, + "grad_norm": 0.21797926723957062, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0033, + "step": 26210 + }, + { + "epoch": 1.5710947330577025, + "grad_norm": 0.1614106148481369, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0034, + "step": 26220 + }, + { + "epoch": 1.571693930133621, + "grad_norm": 0.10198274999856949, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0038, + "step": 26230 + }, + { + "epoch": 1.572293127209539, + "grad_norm": 0.14413216710090637, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0022, + "step": 26240 + }, + { + "epoch": 1.5728923242854576, + "grad_norm": 0.08031613379716873, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0037, + "step": 26250 + }, + { + "epoch": 1.5734915213613756, + "grad_norm": 0.31797754764556885, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0035, + "step": 26260 + }, + { + "epoch": 1.574090718437294, + "grad_norm": 0.0591890886425972, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0027, + "step": 26270 + }, + { + "epoch": 1.5746899155132121, + "grad_norm": 0.15585894882678986, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0064, + "step": 26280 + }, + { + "epoch": 1.5752891125891306, + "grad_norm": 0.13518628478050232, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0042, + "step": 26290 + }, + { + "epoch": 1.5758883096650487, + "grad_norm": 0.13154275715351105, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0033, + "step": 26300 + }, + { + "epoch": 1.5764875067409672, + "grad_norm": 0.33374130725860596, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0061, + "step": 26310 + }, + { + "epoch": 1.5770867038168854, + "grad_norm": 0.12396867573261261, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.004, + "step": 26320 + }, + { + "epoch": 1.5776859008928037, + "grad_norm": 0.08533058315515518, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0031, + "step": 26330 + }, + { + "epoch": 1.578285097968722, + "grad_norm": 0.25102120637893677, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.003, + "step": 26340 + }, + { + "epoch": 1.5788842950446402, + "grad_norm": 0.10319694876670837, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0035, + "step": 26350 + }, + { + "epoch": 1.5794834921205585, + "grad_norm": 0.1508130133152008, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.0046, + "step": 26360 + }, + { + "epoch": 1.5800826891964768, + "grad_norm": 0.09007565677165985, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0025, + "step": 26370 + }, + { + "epoch": 1.580681886272395, + "grad_norm": 0.13807767629623413, + "learning_rate": 2.683592557860616e-06, + "loss": 0.003, + "step": 26380 + }, + { + "epoch": 1.5812810833483133, + "grad_norm": 0.1909133791923523, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.0034, + "step": 26390 + }, + { + "epoch": 1.5818802804242316, + "grad_norm": 0.14300945401191711, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.004, + "step": 26400 + }, + { + "epoch": 1.5824794775001498, + "grad_norm": 0.08184076100587845, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0025, + "step": 26410 + }, + { + "epoch": 1.583078674576068, + "grad_norm": 0.1493527740240097, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0026, + "step": 26420 + }, + { + "epoch": 1.5836778716519864, + "grad_norm": 0.09850698709487915, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0032, + "step": 26430 + }, + { + "epoch": 1.5842770687279046, + "grad_norm": 0.0875677615404129, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0039, + "step": 26440 + }, + { + "epoch": 1.584876265803823, + "grad_norm": 0.2319948524236679, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0039, + "step": 26450 + }, + { + "epoch": 1.5854754628797412, + "grad_norm": 0.10797403752803802, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0036, + "step": 26460 + }, + { + "epoch": 1.5860746599556594, + "grad_norm": 0.19400249421596527, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0034, + "step": 26470 + }, + { + "epoch": 1.5866738570315777, + "grad_norm": 0.1569194793701172, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0039, + "step": 26480 + }, + { + "epoch": 1.587273054107496, + "grad_norm": 0.17117120325565338, + "learning_rate": 2.643185095075473e-06, + "loss": 0.003, + "step": 26490 + }, + { + "epoch": 1.5878722511834142, + "grad_norm": 0.19703997671604156, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0041, + "step": 26500 + }, + { + "epoch": 1.5884714482593325, + "grad_norm": 0.09663215279579163, + "learning_rate": 2.635965610168249e-06, + "loss": 0.005, + "step": 26510 + }, + { + "epoch": 1.5890706453352508, + "grad_norm": 0.13411357998847961, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0035, + "step": 26520 + }, + { + "epoch": 1.589669842411169, + "grad_norm": 0.15013787150382996, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0031, + "step": 26530 + }, + { + "epoch": 1.5902690394870873, + "grad_norm": 0.15517787635326385, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0028, + "step": 26540 + }, + { + "epoch": 1.5908682365630056, + "grad_norm": 0.23037715256214142, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0034, + "step": 26550 + }, + { + "epoch": 1.5914674336389238, + "grad_norm": 0.1925845891237259, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0028, + "step": 26560 + }, + { + "epoch": 1.5920666307148421, + "grad_norm": 0.08933448791503906, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0029, + "step": 26570 + }, + { + "epoch": 1.5926658277907604, + "grad_norm": 0.14989611506462097, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0034, + "step": 26580 + }, + { + "epoch": 1.5932650248666786, + "grad_norm": 0.2904585897922516, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0027, + "step": 26590 + }, + { + "epoch": 1.593864221942597, + "grad_norm": 0.17784662544727325, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0039, + "step": 26600 + }, + { + "epoch": 1.5944634190185152, + "grad_norm": 0.07810595631599426, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0025, + "step": 26610 + }, + { + "epoch": 1.5950626160944334, + "grad_norm": 0.06783948838710785, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.003, + "step": 26620 + }, + { + "epoch": 1.5956618131703517, + "grad_norm": 0.13763132691383362, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0037, + "step": 26630 + }, + { + "epoch": 1.59626101024627, + "grad_norm": 0.1127597987651825, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0045, + "step": 26640 + }, + { + "epoch": 1.5968602073221883, + "grad_norm": 0.07828421145677567, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0027, + "step": 26650 + }, + { + "epoch": 1.5974594043981065, + "grad_norm": 0.1327218860387802, + "learning_rate": 2.583073279935805e-06, + "loss": 0.0042, + "step": 26660 + }, + { + "epoch": 1.5980586014740248, + "grad_norm": 0.09427100419998169, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0027, + "step": 26670 + }, + { + "epoch": 1.598657798549943, + "grad_norm": 0.2112533301115036, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0046, + "step": 26680 + }, + { + "epoch": 1.5992569956258613, + "grad_norm": 0.24039748311042786, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0032, + "step": 26690 + }, + { + "epoch": 1.5998561927017796, + "grad_norm": 0.28341665863990784, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.005, + "step": 26700 + }, + { + "epoch": 1.6004553897776979, + "grad_norm": 0.23401512205600739, + "learning_rate": 2.565935706183804e-06, + "loss": 0.0029, + "step": 26710 + }, + { + "epoch": 1.6010545868536161, + "grad_norm": 0.13487978279590607, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0028, + "step": 26720 + }, + { + "epoch": 1.6016537839295344, + "grad_norm": 0.10604815185070038, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0029, + "step": 26730 + }, + { + "epoch": 1.6022529810054527, + "grad_norm": 0.12193044275045395, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0031, + "step": 26740 + }, + { + "epoch": 1.602852178081371, + "grad_norm": 0.291572630405426, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0033, + "step": 26750 + }, + { + "epoch": 1.6034513751572892, + "grad_norm": 0.14938616752624512, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0027, + "step": 26760 + }, + { + "epoch": 1.6040505722332075, + "grad_norm": 0.16085144877433777, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0022, + "step": 26770 + }, + { + "epoch": 1.6046497693091257, + "grad_norm": 0.14876601099967957, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0034, + "step": 26780 + }, + { + "epoch": 1.605248966385044, + "grad_norm": 0.13766804337501526, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0029, + "step": 26790 + }, + { + "epoch": 1.6058481634609623, + "grad_norm": 0.1824955940246582, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0028, + "step": 26800 + }, + { + "epoch": 1.6064473605368805, + "grad_norm": 0.09187015891075134, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0028, + "step": 26810 + }, + { + "epoch": 1.6070465576127988, + "grad_norm": 0.1488831490278244, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0023, + "step": 26820 + }, + { + "epoch": 1.607645754688717, + "grad_norm": 0.16146720945835114, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0036, + "step": 26830 + }, + { + "epoch": 1.6082449517646353, + "grad_norm": 0.19863533973693848, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0034, + "step": 26840 + }, + { + "epoch": 1.6088441488405536, + "grad_norm": 0.08710742741823196, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0028, + "step": 26850 + }, + { + "epoch": 1.609443345916472, + "grad_norm": 0.1280236840248108, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0037, + "step": 26860 + }, + { + "epoch": 1.6100425429923901, + "grad_norm": 0.29420942068099976, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0031, + "step": 26870 + }, + { + "epoch": 1.6106417400683086, + "grad_norm": 0.16633544862270355, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0037, + "step": 26880 + }, + { + "epoch": 1.6112409371442267, + "grad_norm": 0.10398953408002853, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0032, + "step": 26890 + }, + { + "epoch": 1.6118401342201452, + "grad_norm": 0.1609172523021698, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0031, + "step": 26900 + }, + { + "epoch": 1.6124393312960632, + "grad_norm": 0.14156407117843628, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0032, + "step": 26910 + }, + { + "epoch": 1.6130385283719817, + "grad_norm": 0.3801378309726715, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0039, + "step": 26920 + }, + { + "epoch": 1.6136377254478997, + "grad_norm": 0.1612473726272583, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0026, + "step": 26930 + }, + { + "epoch": 1.6142369225238182, + "grad_norm": 0.3169429898262024, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0038, + "step": 26940 + }, + { + "epoch": 1.6148361195997363, + "grad_norm": 0.11678534001111984, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0041, + "step": 26950 + }, + { + "epoch": 1.6154353166756548, + "grad_norm": 0.08701438456773758, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0025, + "step": 26960 + }, + { + "epoch": 1.6160345137515728, + "grad_norm": 0.14214813709259033, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0032, + "step": 26970 + }, + { + "epoch": 1.6166337108274913, + "grad_norm": 0.06335555016994476, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0022, + "step": 26980 + }, + { + "epoch": 1.6172329079034093, + "grad_norm": 0.1225769966840744, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0035, + "step": 26990 + }, + { + "epoch": 1.6178321049793278, + "grad_norm": 0.12757551670074463, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0034, + "step": 27000 + }, + { + "epoch": 1.6184313020552459, + "grad_norm": 0.04847760871052742, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0025, + "step": 27010 + }, + { + "epoch": 1.6190304991311644, + "grad_norm": 0.11208045482635498, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0028, + "step": 27020 + }, + { + "epoch": 1.6196296962070824, + "grad_norm": 0.10029870271682739, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0042, + "step": 27030 + }, + { + "epoch": 1.620228893283001, + "grad_norm": 0.10894428193569183, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.003, + "step": 27040 + }, + { + "epoch": 1.620828090358919, + "grad_norm": 0.16484397649765015, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0027, + "step": 27050 + }, + { + "epoch": 1.6214272874348374, + "grad_norm": 0.18669992685317993, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0031, + "step": 27060 + }, + { + "epoch": 1.6220264845107555, + "grad_norm": 0.10345451533794403, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0024, + "step": 27070 + }, + { + "epoch": 1.622625681586674, + "grad_norm": 0.14037790894508362, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0027, + "step": 27080 + }, + { + "epoch": 1.623224878662592, + "grad_norm": 0.2581053078174591, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0027, + "step": 27090 + }, + { + "epoch": 1.6238240757385105, + "grad_norm": 0.12379001826047897, + "learning_rate": 2.440792688039862e-06, + "loss": 0.0024, + "step": 27100 + }, + { + "epoch": 1.6244232728144286, + "grad_norm": 0.17116566002368927, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0025, + "step": 27110 + }, + { + "epoch": 1.625022469890347, + "grad_norm": 0.13846145570278168, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0042, + "step": 27120 + }, + { + "epoch": 1.625621666966265, + "grad_norm": 0.09063230454921722, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0032, + "step": 27130 + }, + { + "epoch": 1.6262208640421836, + "grad_norm": 0.19914232194423676, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0043, + "step": 27140 + }, + { + "epoch": 1.6268200611181016, + "grad_norm": 0.13414347171783447, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0031, + "step": 27150 + }, + { + "epoch": 1.6274192581940201, + "grad_norm": 0.11173701286315918, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0037, + "step": 27160 + }, + { + "epoch": 1.6280184552699382, + "grad_norm": 0.11112072318792343, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0032, + "step": 27170 + }, + { + "epoch": 1.6286176523458566, + "grad_norm": 0.27570319175720215, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0048, + "step": 27180 + }, + { + "epoch": 1.6292168494217747, + "grad_norm": 0.09076099097728729, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0033, + "step": 27190 + }, + { + "epoch": 1.6298160464976932, + "grad_norm": 0.185089111328125, + "learning_rate": 2.411157015949963e-06, + "loss": 0.005, + "step": 27200 + }, + { + "epoch": 1.6304152435736112, + "grad_norm": 0.06751414388418198, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0026, + "step": 27210 + }, + { + "epoch": 1.6310144406495297, + "grad_norm": 0.14673012495040894, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0029, + "step": 27220 + }, + { + "epoch": 1.6316136377254478, + "grad_norm": 0.11741532385349274, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0034, + "step": 27230 + }, + { + "epoch": 1.6322128348013663, + "grad_norm": 0.06512618809938431, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0032, + "step": 27240 + }, + { + "epoch": 1.6328120318772843, + "grad_norm": 0.22004343569278717, + "learning_rate": 2.396716944205467e-06, + "loss": 0.004, + "step": 27250 + }, + { + "epoch": 1.6334112289532028, + "grad_norm": 0.1706841140985489, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0028, + "step": 27260 + }, + { + "epoch": 1.6340104260291208, + "grad_norm": 0.1023155003786087, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0029, + "step": 27270 + }, + { + "epoch": 1.6346096231050393, + "grad_norm": 0.17524677515029907, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0039, + "step": 27280 + }, + { + "epoch": 1.6352088201809574, + "grad_norm": 0.10368278622627258, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0026, + "step": 27290 + }, + { + "epoch": 1.6358080172568759, + "grad_norm": 0.06621989607810974, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0028, + "step": 27300 + }, + { + "epoch": 1.636407214332794, + "grad_norm": 0.2700876295566559, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0045, + "step": 27310 + }, + { + "epoch": 1.6370064114087124, + "grad_norm": 0.07727917283773422, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0034, + "step": 27320 + }, + { + "epoch": 1.6376056084846304, + "grad_norm": 0.11636139452457428, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0026, + "step": 27330 + }, + { + "epoch": 1.638204805560549, + "grad_norm": 0.07539201527833939, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0029, + "step": 27340 + }, + { + "epoch": 1.638804002636467, + "grad_norm": 0.14615486562252045, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0037, + "step": 27350 + }, + { + "epoch": 1.6394031997123855, + "grad_norm": 0.10396217554807663, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.004, + "step": 27360 + }, + { + "epoch": 1.6400023967883035, + "grad_norm": 0.08993582427501678, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0037, + "step": 27370 + }, + { + "epoch": 1.640601593864222, + "grad_norm": 0.15601681172847748, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0032, + "step": 27380 + }, + { + "epoch": 1.6412007909401403, + "grad_norm": 0.27940425276756287, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0035, + "step": 27390 + }, + { + "epoch": 1.6417999880160585, + "grad_norm": 0.19063127040863037, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0032, + "step": 27400 + }, + { + "epoch": 1.6423991850919768, + "grad_norm": 0.0989932119846344, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0033, + "step": 27410 + }, + { + "epoch": 1.642998382167895, + "grad_norm": 0.1885364055633545, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0046, + "step": 27420 + }, + { + "epoch": 1.6435975792438133, + "grad_norm": 0.19619473814964294, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0032, + "step": 27430 + }, + { + "epoch": 1.6441967763197316, + "grad_norm": 0.3142991364002228, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0041, + "step": 27440 + }, + { + "epoch": 1.6447959733956499, + "grad_norm": 0.09447146952152252, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0034, + "step": 27450 + }, + { + "epoch": 1.6453951704715681, + "grad_norm": 0.12683053314685822, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0044, + "step": 27460 + }, + { + "epoch": 1.6459943675474864, + "grad_norm": 0.30415666103363037, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0048, + "step": 27470 + }, + { + "epoch": 1.6465935646234047, + "grad_norm": 0.16130568087100983, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0032, + "step": 27480 + }, + { + "epoch": 1.647192761699323, + "grad_norm": 0.19884297251701355, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0036, + "step": 27490 + }, + { + "epoch": 1.6477919587752412, + "grad_norm": 0.2124500721693039, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0048, + "step": 27500 + }, + { + "epoch": 1.6483911558511595, + "grad_norm": 0.20656649768352509, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0044, + "step": 27510 + }, + { + "epoch": 1.6489903529270777, + "grad_norm": 0.08909416943788528, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0042, + "step": 27520 + }, + { + "epoch": 1.649589550002996, + "grad_norm": 0.1665533483028412, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0045, + "step": 27530 + }, + { + "epoch": 1.6501887470789143, + "grad_norm": 0.11362092941999435, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0028, + "step": 27540 + }, + { + "epoch": 1.6507879441548325, + "grad_norm": 0.11079458892345428, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0044, + "step": 27550 + }, + { + "epoch": 1.6513871412307508, + "grad_norm": 0.1600227653980255, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0032, + "step": 27560 + }, + { + "epoch": 1.651986338306669, + "grad_norm": 0.10425245016813278, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0024, + "step": 27570 + }, + { + "epoch": 1.6525855353825873, + "grad_norm": 0.142449289560318, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0031, + "step": 27580 + }, + { + "epoch": 1.6531847324585056, + "grad_norm": 0.13777248561382294, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0031, + "step": 27590 + }, + { + "epoch": 1.6537839295344239, + "grad_norm": 0.21916678547859192, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0034, + "step": 27600 + }, + { + "epoch": 1.6543831266103421, + "grad_norm": 0.11044235527515411, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0032, + "step": 27610 + }, + { + "epoch": 1.6549823236862604, + "grad_norm": 0.30877354741096497, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0039, + "step": 27620 + }, + { + "epoch": 1.6555815207621787, + "grad_norm": 0.12299321591854095, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.003, + "step": 27630 + }, + { + "epoch": 1.656180717838097, + "grad_norm": 0.10495458543300629, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0036, + "step": 27640 + }, + { + "epoch": 1.6567799149140152, + "grad_norm": 0.13938122987747192, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0045, + "step": 27650 + }, + { + "epoch": 1.6573791119899335, + "grad_norm": 0.1632867157459259, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0043, + "step": 27660 + }, + { + "epoch": 1.6579783090658518, + "grad_norm": 0.11505074799060822, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.004, + "step": 27670 + }, + { + "epoch": 1.65857750614177, + "grad_norm": 0.19847853481769562, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0033, + "step": 27680 + }, + { + "epoch": 1.6591767032176883, + "grad_norm": 0.0759914219379425, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0025, + "step": 27690 + }, + { + "epoch": 1.6597759002936066, + "grad_norm": 0.23778557777404785, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0029, + "step": 27700 + }, + { + "epoch": 1.6603750973695248, + "grad_norm": 0.14102019369602203, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0026, + "step": 27710 + }, + { + "epoch": 1.660974294445443, + "grad_norm": 0.09396950155496597, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0032, + "step": 27720 + }, + { + "epoch": 1.6615734915213614, + "grad_norm": 0.1578163504600525, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0034, + "step": 27730 + }, + { + "epoch": 1.6621726885972796, + "grad_norm": 0.12897615134716034, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0034, + "step": 27740 + }, + { + "epoch": 1.662771885673198, + "grad_norm": 0.05674497038125992, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0028, + "step": 27750 + }, + { + "epoch": 1.6633710827491162, + "grad_norm": 0.12161347270011902, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0033, + "step": 27760 + }, + { + "epoch": 1.6639702798250344, + "grad_norm": 0.11158734560012817, + "learning_rate": 2.261577490652103e-06, + "loss": 0.004, + "step": 27770 + }, + { + "epoch": 1.6645694769009527, + "grad_norm": 0.09899834543466568, + "learning_rate": 2.259249109209093e-06, + "loss": 0.003, + "step": 27780 + }, + { + "epoch": 1.665168673976871, + "grad_norm": 0.2654432952404022, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0032, + "step": 27790 + }, + { + "epoch": 1.6657678710527892, + "grad_norm": 0.1188909262418747, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0034, + "step": 27800 + }, + { + "epoch": 1.6663670681287075, + "grad_norm": 0.4437197148799896, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.005, + "step": 27810 + }, + { + "epoch": 1.6669662652046258, + "grad_norm": 0.17790400981903076, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0033, + "step": 27820 + }, + { + "epoch": 1.667565462280544, + "grad_norm": 0.10867536813020706, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0032, + "step": 27830 + }, + { + "epoch": 1.6681646593564623, + "grad_norm": 0.10958084464073181, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0024, + "step": 27840 + }, + { + "epoch": 1.6687638564323806, + "grad_norm": 0.06520948559045792, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0029, + "step": 27850 + }, + { + "epoch": 1.6693630535082988, + "grad_norm": 0.13580842316150665, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0029, + "step": 27860 + }, + { + "epoch": 1.669962250584217, + "grad_norm": 0.15817365050315857, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.002, + "step": 27870 + }, + { + "epoch": 1.6705614476601354, + "grad_norm": 0.35285326838493347, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0062, + "step": 27880 + }, + { + "epoch": 1.6711606447360536, + "grad_norm": 0.24554285407066345, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0043, + "step": 27890 + }, + { + "epoch": 1.671759841811972, + "grad_norm": 0.16509993374347687, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0032, + "step": 27900 + }, + { + "epoch": 1.6723590388878904, + "grad_norm": 0.13468189537525177, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0026, + "step": 27910 + }, + { + "epoch": 1.6729582359638084, + "grad_norm": 0.17360062897205353, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0029, + "step": 27920 + }, + { + "epoch": 1.673557433039727, + "grad_norm": 0.12582021951675415, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0035, + "step": 27930 + }, + { + "epoch": 1.674156630115645, + "grad_norm": 0.1015002503991127, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0033, + "step": 27940 + }, + { + "epoch": 1.6747558271915635, + "grad_norm": 0.3634873926639557, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0031, + "step": 27950 + }, + { + "epoch": 1.6753550242674815, + "grad_norm": 0.15137465298175812, + "learning_rate": 2.219094909262834e-06, + "loss": 0.006, + "step": 27960 + }, + { + "epoch": 1.6759542213434, + "grad_norm": 0.09976715594530106, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0031, + "step": 27970 + }, + { + "epoch": 1.676553418419318, + "grad_norm": 0.09910957515239716, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0024, + "step": 27980 + }, + { + "epoch": 1.6771526154952365, + "grad_norm": 0.11276205629110336, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0032, + "step": 27990 + }, + { + "epoch": 1.6777518125711546, + "grad_norm": 0.22798313200473785, + "learning_rate": 2.210624641425579e-06, + "loss": 0.004, + "step": 28000 + }, + { + "epoch": 1.678351009647073, + "grad_norm": 0.10564117878675461, + "learning_rate": 2.208532855337684e-06, + "loss": 0.003, + "step": 28010 + }, + { + "epoch": 1.6789502067229911, + "grad_norm": 0.4329298138618469, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.0048, + "step": 28020 + }, + { + "epoch": 1.6795494037989096, + "grad_norm": 0.19210365414619446, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0034, + "step": 28030 + }, + { + "epoch": 1.6801486008748276, + "grad_norm": 0.15383297204971313, + "learning_rate": 2.202319410491029e-06, + "loss": 0.0032, + "step": 28040 + }, + { + "epoch": 1.6807477979507461, + "grad_norm": 0.0796160027384758, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.0029, + "step": 28050 + }, + { + "epoch": 1.6813469950266642, + "grad_norm": 0.13568224012851715, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0032, + "step": 28060 + }, + { + "epoch": 1.6819461921025827, + "grad_norm": 0.16137683391571045, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0034, + "step": 28070 + }, + { + "epoch": 1.6825453891785007, + "grad_norm": 0.07282284647226334, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0031, + "step": 28080 + }, + { + "epoch": 1.6831445862544192, + "grad_norm": 0.17293596267700195, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.0034, + "step": 28090 + }, + { + "epoch": 1.6837437833303373, + "grad_norm": 0.12304569780826569, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0032, + "step": 28100 + }, + { + "epoch": 1.6843429804062557, + "grad_norm": 0.08121246099472046, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0031, + "step": 28110 + }, + { + "epoch": 1.6849421774821738, + "grad_norm": 0.20509444177150726, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.003, + "step": 28120 + }, + { + "epoch": 1.6855413745580923, + "grad_norm": 0.13242113590240479, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0028, + "step": 28130 + }, + { + "epoch": 1.6861405716340103, + "grad_norm": 0.14173154532909393, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0028, + "step": 28140 + }, + { + "epoch": 1.6867397687099288, + "grad_norm": 0.40674927830696106, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0038, + "step": 28150 + }, + { + "epoch": 1.6873389657858469, + "grad_norm": 0.08623358607292175, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0044, + "step": 28160 + }, + { + "epoch": 1.6879381628617653, + "grad_norm": 0.15655292570590973, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0035, + "step": 28170 + }, + { + "epoch": 1.6885373599376834, + "grad_norm": 0.13297052681446075, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0036, + "step": 28180 + }, + { + "epoch": 1.6891365570136019, + "grad_norm": 0.16847316920757294, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0045, + "step": 28190 + }, + { + "epoch": 1.68973575408952, + "grad_norm": 0.24691548943519592, + "learning_rate": 2.17075192815944e-06, + "loss": 0.0036, + "step": 28200 + }, + { + "epoch": 1.6903349511654384, + "grad_norm": 0.16196060180664062, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0034, + "step": 28210 + }, + { + "epoch": 1.6909341482413565, + "grad_norm": 0.24568283557891846, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0046, + "step": 28220 + }, + { + "epoch": 1.691533345317275, + "grad_norm": 0.2796950042247772, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0028, + "step": 28230 + }, + { + "epoch": 1.692132542393193, + "grad_norm": 0.12654386460781097, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0028, + "step": 28240 + }, + { + "epoch": 1.6927317394691115, + "grad_norm": 0.26169532537460327, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0039, + "step": 28250 + }, + { + "epoch": 1.6933309365450295, + "grad_norm": 0.19274167716503143, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.0035, + "step": 28260 + }, + { + "epoch": 1.693930133620948, + "grad_norm": 0.2024545818567276, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0031, + "step": 28270 + }, + { + "epoch": 1.694529330696866, + "grad_norm": 0.12702754139900208, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0031, + "step": 28280 + }, + { + "epoch": 1.6951285277727846, + "grad_norm": 0.09141751378774643, + "learning_rate": 2.154160296808588e-06, + "loss": 0.0034, + "step": 28290 + }, + { + "epoch": 1.6957277248487026, + "grad_norm": 0.049686893820762634, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0023, + "step": 28300 + }, + { + "epoch": 1.696326921924621, + "grad_norm": 0.14534041285514832, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0031, + "step": 28310 + }, + { + "epoch": 1.6969261190005391, + "grad_norm": 0.08376278728246689, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0036, + "step": 28320 + }, + { + "epoch": 1.6975253160764576, + "grad_norm": 0.14242660999298096, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0027, + "step": 28330 + }, + { + "epoch": 1.6981245131523757, + "grad_norm": 0.13080888986587524, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0026, + "step": 28340 + }, + { + "epoch": 1.6987237102282942, + "grad_norm": 0.15067961812019348, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0027, + "step": 28350 + }, + { + "epoch": 1.6993229073042122, + "grad_norm": 0.06736161559820175, + "learning_rate": 2.141836952964938e-06, + "loss": 0.002, + "step": 28360 + }, + { + "epoch": 1.6999221043801307, + "grad_norm": 0.17074856162071228, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0033, + "step": 28370 + }, + { + "epoch": 1.7005213014560487, + "grad_norm": 0.12649405002593994, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0033, + "step": 28380 + }, + { + "epoch": 1.7011204985319672, + "grad_norm": 0.15945205092430115, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0034, + "step": 28390 + }, + { + "epoch": 1.7017196956078853, + "grad_norm": 0.09780653566122055, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0025, + "step": 28400 + }, + { + "epoch": 1.7023188926838038, + "grad_norm": 0.4059111773967743, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0036, + "step": 28410 + }, + { + "epoch": 1.7029180897597218, + "grad_norm": 0.17648592591285706, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0044, + "step": 28420 + }, + { + "epoch": 1.7035172868356403, + "grad_norm": 0.22077783942222595, + "learning_rate": 2.130022988569117e-06, + "loss": 0.0024, + "step": 28430 + }, + { + "epoch": 1.7041164839115586, + "grad_norm": 0.29329779744148254, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0033, + "step": 28440 + }, + { + "epoch": 1.7047156809874768, + "grad_norm": 0.11621195822954178, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0034, + "step": 28450 + }, + { + "epoch": 1.705314878063395, + "grad_norm": 0.15548695623874664, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0029, + "step": 28460 + }, + { + "epoch": 1.7059140751393134, + "grad_norm": 0.11100694537162781, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0028, + "step": 28470 + }, + { + "epoch": 1.7065132722152316, + "grad_norm": 0.226019024848938, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0039, + "step": 28480 + }, + { + "epoch": 1.70711246929115, + "grad_norm": 0.11407872289419174, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.0027, + "step": 28490 + }, + { + "epoch": 1.7077116663670682, + "grad_norm": 0.10995867848396301, + "learning_rate": 2.118719082970852e-06, + "loss": 0.003, + "step": 28500 + }, + { + "epoch": 1.7083108634429864, + "grad_norm": 0.2666904926300049, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0029, + "step": 28510 + }, + { + "epoch": 1.7089100605189047, + "grad_norm": 0.17604438960552216, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0032, + "step": 28520 + }, + { + "epoch": 1.709509257594823, + "grad_norm": 0.26522526144981384, + "learning_rate": 2.114030854571176e-06, + "loss": 0.0031, + "step": 28530 + }, + { + "epoch": 1.7101084546707412, + "grad_norm": 0.0793771743774414, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0029, + "step": 28540 + }, + { + "epoch": 1.7107076517466595, + "grad_norm": 0.15632960200309753, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.0039, + "step": 28550 + }, + { + "epoch": 1.7113068488225778, + "grad_norm": 0.14011208713054657, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0037, + "step": 28560 + }, + { + "epoch": 1.711906045898496, + "grad_norm": 0.12041427195072174, + "learning_rate": 2.107925886189472e-06, + "loss": 0.0043, + "step": 28570 + }, + { + "epoch": 1.7125052429744143, + "grad_norm": 0.1488238424062729, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.0032, + "step": 28580 + }, + { + "epoch": 1.7131044400503326, + "grad_norm": 0.21010251343250275, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0034, + "step": 28590 + }, + { + "epoch": 1.7137036371262508, + "grad_norm": 0.05566203221678734, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0027, + "step": 28600 + }, + { + "epoch": 1.7143028342021691, + "grad_norm": 0.21272292733192444, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.0044, + "step": 28610 + }, + { + "epoch": 1.7149020312780874, + "grad_norm": 0.12333092093467712, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0045, + "step": 28620 + }, + { + "epoch": 1.7155012283540056, + "grad_norm": 0.13811928033828735, + "learning_rate": 2.099081528450828e-06, + "loss": 0.0035, + "step": 28630 + }, + { + "epoch": 1.716100425429924, + "grad_norm": 0.07431097328662872, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0022, + "step": 28640 + }, + { + "epoch": 1.7166996225058422, + "grad_norm": 0.08534728735685349, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0017, + "step": 28650 + }, + { + "epoch": 1.7172988195817604, + "grad_norm": 0.3239804804325104, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0029, + "step": 28660 + }, + { + "epoch": 1.7178980166576787, + "grad_norm": 0.13592901825904846, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.0023, + "step": 28670 + }, + { + "epoch": 1.718497213733597, + "grad_norm": 0.19489605724811554, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0036, + "step": 28680 + }, + { + "epoch": 1.7190964108095153, + "grad_norm": 0.10355830937623978, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0028, + "step": 28690 + }, + { + "epoch": 1.7196956078854335, + "grad_norm": 0.09176923334598541, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0038, + "step": 28700 + }, + { + "epoch": 1.7202948049613518, + "grad_norm": 0.11903666704893112, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0031, + "step": 28710 + }, + { + "epoch": 1.72089400203727, + "grad_norm": 0.0953182652592659, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0035, + "step": 28720 + }, + { + "epoch": 1.7214931991131883, + "grad_norm": 0.16318652033805847, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0067, + "step": 28730 + }, + { + "epoch": 1.7220923961891066, + "grad_norm": 0.07236472517251968, + "learning_rate": 2.083843803000755e-06, + "loss": 0.0028, + "step": 28740 + }, + { + "epoch": 1.7226915932650249, + "grad_norm": 0.1972363442182541, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0034, + "step": 28750 + }, + { + "epoch": 1.7232907903409431, + "grad_norm": 0.09618771076202393, + "learning_rate": 2.081209272216522e-06, + "loss": 0.0041, + "step": 28760 + }, + { + "epoch": 1.7238899874168614, + "grad_norm": 0.10503746569156647, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0023, + "step": 28770 + }, + { + "epoch": 1.7244891844927797, + "grad_norm": 0.06267210096120834, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0037, + "step": 28780 + }, + { + "epoch": 1.725088381568698, + "grad_norm": 0.08882488310337067, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0033, + "step": 28790 + }, + { + "epoch": 1.7256875786446162, + "grad_norm": 0.0851057916879654, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0031, + "step": 28800 + }, + { + "epoch": 1.7262867757205345, + "grad_norm": 0.07359647005796432, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0023, + "step": 28810 + }, + { + "epoch": 1.7268859727964527, + "grad_norm": 0.10275846719741821, + "learning_rate": 2.073556929020357e-06, + "loss": 0.003, + "step": 28820 + }, + { + "epoch": 1.727485169872371, + "grad_norm": 0.20987747609615326, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0028, + "step": 28830 + }, + { + "epoch": 1.7280843669482893, + "grad_norm": 0.18235883116722107, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0023, + "step": 28840 + }, + { + "epoch": 1.7286835640242075, + "grad_norm": 0.33091968297958374, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.0036, + "step": 28850 + }, + { + "epoch": 1.7292827611001258, + "grad_norm": 0.17163614928722382, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0038, + "step": 28860 + }, + { + "epoch": 1.729881958176044, + "grad_norm": 0.1668156236410141, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0036, + "step": 28870 + }, + { + "epoch": 1.7304811552519623, + "grad_norm": 0.11935430020093918, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0038, + "step": 28880 + }, + { + "epoch": 1.7310803523278806, + "grad_norm": 0.12218618392944336, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0026, + "step": 28890 + }, + { + "epoch": 1.7316795494037989, + "grad_norm": 0.07735276222229004, + "learning_rate": 2.063940494310565e-06, + "loss": 0.0037, + "step": 28900 + }, + { + "epoch": 1.7322787464797171, + "grad_norm": 0.11424548178911209, + "learning_rate": 2.062785616754097e-06, + "loss": 0.0033, + "step": 28910 + }, + { + "epoch": 1.7328779435556354, + "grad_norm": 0.14824451506137848, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0033, + "step": 28920 + }, + { + "epoch": 1.7334771406315537, + "grad_norm": 0.1048273965716362, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0029, + "step": 28930 + }, + { + "epoch": 1.734076337707472, + "grad_norm": 0.29807451367378235, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0042, + "step": 28940 + }, + { + "epoch": 1.7346755347833902, + "grad_norm": 0.1315613090991974, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0027, + "step": 28950 + }, + { + "epoch": 1.7352747318593085, + "grad_norm": 0.1035790666937828, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0027, + "step": 28960 + }, + { + "epoch": 1.7358739289352267, + "grad_norm": 0.07466026395559311, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0034, + "step": 28970 + }, + { + "epoch": 1.7364731260111452, + "grad_norm": 0.11429832130670547, + "learning_rate": 2.054995221133923e-06, + "loss": 0.003, + "step": 28980 + }, + { + "epoch": 1.7370723230870633, + "grad_norm": 0.0868537500500679, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0021, + "step": 28990 + }, + { + "epoch": 1.7376715201629818, + "grad_norm": 0.120276540517807, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0031, + "step": 29000 + }, + { + "epoch": 1.7382707172388998, + "grad_norm": 0.13164187967777252, + "learning_rate": 2.051813916260501e-06, + "loss": 0.0033, + "step": 29010 + }, + { + "epoch": 1.7388699143148183, + "grad_norm": 0.11800370365381241, + "learning_rate": 2.050774482045273e-06, + "loss": 0.003, + "step": 29020 + }, + { + "epoch": 1.7394691113907363, + "grad_norm": 0.09126367419958115, + "learning_rate": 2.049745550244661e-06, + "loss": 0.003, + "step": 29030 + }, + { + "epoch": 1.7400683084666548, + "grad_norm": 0.4037914276123047, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0031, + "step": 29040 + }, + { + "epoch": 1.7406675055425729, + "grad_norm": 0.11752860993146896, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0033, + "step": 29050 + }, + { + "epoch": 1.7412667026184914, + "grad_norm": 0.2887340188026428, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.0031, + "step": 29060 + }, + { + "epoch": 1.7418658996944094, + "grad_norm": 0.16717016696929932, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.0029, + "step": 29070 + }, + { + "epoch": 1.742465096770328, + "grad_norm": 0.10888686776161194, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.0029, + "step": 29080 + }, + { + "epoch": 1.743064293846246, + "grad_norm": 0.04226887598633766, + "learning_rate": 2.043792576905478e-06, + "loss": 0.003, + "step": 29090 + }, + { + "epoch": 1.7436634909221644, + "grad_norm": 0.20809385180473328, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.0035, + "step": 29100 + }, + { + "epoch": 1.7442626879980825, + "grad_norm": 0.09200141578912735, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0027, + "step": 29110 + }, + { + "epoch": 1.744861885074001, + "grad_norm": 0.11600892245769501, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0024, + "step": 29120 + }, + { + "epoch": 1.745461082149919, + "grad_norm": 0.12580904364585876, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0032, + "step": 29130 + }, + { + "epoch": 1.7460602792258375, + "grad_norm": 0.14242342114448547, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0031, + "step": 29140 + }, + { + "epoch": 1.7466594763017556, + "grad_norm": 0.06451822817325592, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0029, + "step": 29150 + }, + { + "epoch": 1.747258673377674, + "grad_norm": 0.1550203114748001, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.0039, + "step": 29160 + }, + { + "epoch": 1.747857870453592, + "grad_norm": 0.1968315690755844, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.004, + "step": 29170 + }, + { + "epoch": 1.7484570675295106, + "grad_norm": 0.16437779366970062, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0027, + "step": 29180 + }, + { + "epoch": 1.7490562646054286, + "grad_norm": 0.1665470451116562, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0032, + "step": 29190 + }, + { + "epoch": 1.7496554616813471, + "grad_norm": 0.070770762860775, + "learning_rate": 2.033861701352752e-06, + "loss": 0.0037, + "step": 29200 + }, + { + "epoch": 1.7502546587572652, + "grad_norm": 0.23530884087085724, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.0031, + "step": 29210 + }, + { + "epoch": 1.7508538558331836, + "grad_norm": 0.10555171966552734, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0052, + "step": 29220 + }, + { + "epoch": 1.7514530529091017, + "grad_norm": 0.0906955823302269, + "learning_rate": 2.031374190660474e-06, + "loss": 0.0026, + "step": 29230 + }, + { + "epoch": 1.7520522499850202, + "grad_norm": 0.137167826294899, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0045, + "step": 29240 + }, + { + "epoch": 1.7526514470609382, + "grad_norm": 0.10824514180421829, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.003, + "step": 29250 + }, + { + "epoch": 1.7532506441368567, + "grad_norm": 0.1013123095035553, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0033, + "step": 29260 + }, + { + "epoch": 1.7538498412127748, + "grad_norm": 0.05192271247506142, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0025, + "step": 29270 + }, + { + "epoch": 1.7544490382886933, + "grad_norm": 0.2546662986278534, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0041, + "step": 29280 + }, + { + "epoch": 1.7550482353646113, + "grad_norm": 0.16946350038051605, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.0033, + "step": 29290 + }, + { + "epoch": 1.7556474324405298, + "grad_norm": 0.19652776420116425, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0041, + "step": 29300 + }, + { + "epoch": 1.7562466295164478, + "grad_norm": 0.1391136795282364, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0052, + "step": 29310 + }, + { + "epoch": 1.7568458265923663, + "grad_norm": 0.1327095329761505, + "learning_rate": 2.02448011805316e-06, + "loss": 0.0033, + "step": 29320 + }, + { + "epoch": 1.7574450236682844, + "grad_norm": 0.12003149092197418, + "learning_rate": 2.023766764732934e-06, + "loss": 0.003, + "step": 29330 + }, + { + "epoch": 1.7580442207442029, + "grad_norm": 0.10392506420612335, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0028, + "step": 29340 + }, + { + "epoch": 1.758643417820121, + "grad_norm": 0.19025546312332153, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0043, + "step": 29350 + }, + { + "epoch": 1.7592426148960394, + "grad_norm": 0.2707730829715729, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0033, + "step": 29360 + }, + { + "epoch": 1.7598418119719574, + "grad_norm": 0.20897458493709564, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0037, + "step": 29370 + }, + { + "epoch": 1.760441009047876, + "grad_norm": 0.1678476631641388, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0031, + "step": 29380 + }, + { + "epoch": 1.761040206123794, + "grad_norm": 0.06412776559591293, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0027, + "step": 29390 + }, + { + "epoch": 1.7616394031997125, + "grad_norm": 0.07877464592456818, + "learning_rate": 2.01906829188547e-06, + "loss": 0.0027, + "step": 29400 + }, + { + "epoch": 1.7622386002756305, + "grad_norm": 0.28429147601127625, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0036, + "step": 29410 + }, + { + "epoch": 1.762837797351549, + "grad_norm": 0.11621754616498947, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0028, + "step": 29420 + }, + { + "epoch": 1.763436994427467, + "grad_norm": 0.14076471328735352, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0037, + "step": 29430 + }, + { + "epoch": 1.7640361915033855, + "grad_norm": 0.15799261629581451, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.0035, + "step": 29440 + }, + { + "epoch": 1.7646353885793036, + "grad_norm": 0.08505155891180038, + "learning_rate": 2.016028412496094e-06, + "loss": 0.0024, + "step": 29450 + }, + { + "epoch": 1.765234585655222, + "grad_norm": 0.10400725156068802, + "learning_rate": 2.015452063399292e-06, + "loss": 0.0048, + "step": 29460 + }, + { + "epoch": 1.7658337827311401, + "grad_norm": 0.19443514943122864, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0033, + "step": 29470 + }, + { + "epoch": 1.7664329798070586, + "grad_norm": 0.06996763497591019, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0031, + "step": 29480 + }, + { + "epoch": 1.7670321768829766, + "grad_norm": 0.12015870213508606, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0028, + "step": 29490 + }, + { + "epoch": 1.7676313739588951, + "grad_norm": 0.14683139324188232, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.004, + "step": 29500 + }, + { + "epoch": 1.7682305710348134, + "grad_norm": 0.1023707166314125, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0031, + "step": 29510 + }, + { + "epoch": 1.7688297681107317, + "grad_norm": 0.16903221607208252, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0027, + "step": 29520 + }, + { + "epoch": 1.76942896518665, + "grad_norm": 0.10042630881071091, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0035, + "step": 29530 + }, + { + "epoch": 1.7700281622625682, + "grad_norm": 0.0850566178560257, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0059, + "step": 29540 + }, + { + "epoch": 1.7706273593384865, + "grad_norm": 0.09834299236536026, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0028, + "step": 29550 + }, + { + "epoch": 1.7712265564144047, + "grad_norm": 0.13409706950187683, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0031, + "step": 29560 + }, + { + "epoch": 1.771825753490323, + "grad_norm": 0.08727999776601791, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0032, + "step": 29570 + }, + { + "epoch": 1.7724249505662413, + "grad_norm": 0.05625200644135475, + "learning_rate": 2.009358476954456e-06, + "loss": 0.003, + "step": 29580 + }, + { + "epoch": 1.7730241476421595, + "grad_norm": 0.13802480697631836, + "learning_rate": 2.008919246387043e-06, + "loss": 0.003, + "step": 29590 + }, + { + "epoch": 1.7736233447180778, + "grad_norm": 0.12048090994358063, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0023, + "step": 29600 + }, + { + "epoch": 1.774222541793996, + "grad_norm": 0.06570231169462204, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0025, + "step": 29610 + }, + { + "epoch": 1.7748217388699143, + "grad_norm": 0.1293211132287979, + "learning_rate": 2.007664865889131e-06, + "loss": 0.0029, + "step": 29620 + }, + { + "epoch": 1.7754209359458326, + "grad_norm": 0.19836539030075073, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.0026, + "step": 29630 + }, + { + "epoch": 1.7760201330217509, + "grad_norm": 0.23906737565994263, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0028, + "step": 29640 + }, + { + "epoch": 1.7766193300976691, + "grad_norm": 0.1388060599565506, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0034, + "step": 29650 + }, + { + "epoch": 1.7772185271735874, + "grad_norm": 0.09379242360591888, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0031, + "step": 29660 + }, + { + "epoch": 1.7778177242495057, + "grad_norm": 0.18343773484230042, + "learning_rate": 2.005785297772354e-06, + "loss": 0.003, + "step": 29670 + }, + { + "epoch": 1.778416921325424, + "grad_norm": 0.16866934299468994, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0026, + "step": 29680 + }, + { + "epoch": 1.7790161184013422, + "grad_norm": 0.08610724657773972, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0027, + "step": 29690 + }, + { + "epoch": 1.7796153154772605, + "grad_norm": 0.060445595532655716, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.002, + "step": 29700 + }, + { + "epoch": 1.7802145125531788, + "grad_norm": 0.19706133008003235, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0029, + "step": 29710 + }, + { + "epoch": 1.780813709629097, + "grad_norm": 0.10716386139392853, + "learning_rate": 2.004169609334462e-06, + "loss": 0.004, + "step": 29720 + }, + { + "epoch": 1.7814129067050153, + "grad_norm": 0.18327921628952026, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0044, + "step": 29730 + }, + { + "epoch": 1.7820121037809336, + "grad_norm": 0.1188778281211853, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0042, + "step": 29740 + }, + { + "epoch": 1.7826113008568518, + "grad_norm": 0.2874482274055481, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.003, + "step": 29750 + }, + { + "epoch": 1.78321049793277, + "grad_norm": 0.07464325428009033, + "learning_rate": 2.003067084067522e-06, + "loss": 0.0029, + "step": 29760 + }, + { + "epoch": 1.7838096950086884, + "grad_norm": 0.17671462893486023, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.0029, + "step": 29770 + }, + { + "epoch": 1.7844088920846066, + "grad_norm": 0.09008106589317322, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0031, + "step": 29780 + }, + { + "epoch": 1.785008089160525, + "grad_norm": 0.06681933254003525, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.0029, + "step": 29790 + }, + { + "epoch": 1.7856072862364432, + "grad_norm": 0.09899364411830902, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.0032, + "step": 29800 + }, + { + "epoch": 1.7862064833123614, + "grad_norm": 0.0671558529138565, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.0036, + "step": 29810 + }, + { + "epoch": 1.7868056803882797, + "grad_norm": 0.11834210157394409, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0033, + "step": 29820 + }, + { + "epoch": 1.787404877464198, + "grad_norm": 0.37054625153541565, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0038, + "step": 29830 + }, + { + "epoch": 1.7880040745401162, + "grad_norm": 0.12430086731910706, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0043, + "step": 29840 + }, + { + "epoch": 1.7886032716160345, + "grad_norm": 0.15685392916202545, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0031, + "step": 29850 + }, + { + "epoch": 1.7892024686919528, + "grad_norm": 0.15734116733074188, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0034, + "step": 29860 + }, + { + "epoch": 1.789801665767871, + "grad_norm": 0.1462196558713913, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0027, + "step": 29870 + }, + { + "epoch": 1.7904008628437893, + "grad_norm": 0.10963186621665955, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0031, + "step": 29880 + }, + { + "epoch": 1.7910000599197076, + "grad_norm": 0.08986041694879532, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0024, + "step": 29890 + }, + { + "epoch": 1.7915992569956258, + "grad_norm": 0.1555427759885788, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.0048, + "step": 29900 + }, + { + "epoch": 1.792198454071544, + "grad_norm": 0.10785987228155136, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0027, + "step": 29910 + }, + { + "epoch": 1.7927976511474624, + "grad_norm": 0.16140185296535492, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0032, + "step": 29920 + }, + { + "epoch": 1.7933968482233806, + "grad_norm": 0.21847034990787506, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0038, + "step": 29930 + }, + { + "epoch": 1.793996045299299, + "grad_norm": 0.11339953541755676, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.003, + "step": 29940 + }, + { + "epoch": 1.7945952423752172, + "grad_norm": 0.142179474234581, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0036, + "step": 29950 + }, + { + "epoch": 1.7951944394511354, + "grad_norm": 0.09894557297229767, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0026, + "step": 29960 + }, + { + "epoch": 1.7957936365270537, + "grad_norm": 0.2643095850944519, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0027, + "step": 29970 + }, + { + "epoch": 1.796392833602972, + "grad_norm": 0.11731639504432678, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0028, + "step": 29980 + }, + { + "epoch": 1.7969920306788902, + "grad_norm": 0.07690370082855225, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0028, + "step": 29990 + }, + { + "epoch": 1.7975912277548085, + "grad_norm": 0.10461316257715225, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.0037, + "step": 30000 + }, + { + "epoch": 1.7975912277548085, + "step": 30000, + "total_flos": 1.873893288742748e+17, + "train_loss": 0.006228442152775824, + "train_runtime": 19636.4414, + "train_samples_per_second": 12.222, + "train_steps_per_second": 1.528 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.873893288742748e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..19d22af7b7d6155175015b5c3c5b452030d153ea --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260205-201028_lr2e-05_batchsize8/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf8e16588ffacf58cd09ed0241d355125d76c992d11c15a4bc8ee94db38dc3b +size 6097