magicgh committed on May 21, 2025

Commit

632dd61

0 Parent(s):

First commit

Browse files

Files changed (26) hide show

.gitattributes +36 -0
README.md +202 -0
adapter_config.json +43 -0
adapter_model.bin +3 -0
added_tokens.json +16 -0
chat_template.json +3 -0
config.json +120 -0
merges.txt +0 -0
optimizer.pt +3 -0
preprocessor_config.json +29 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_2.pth +3 -0
rng_state_3.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +145 -0
tokenselection_modules.py +4 -0
trainer_state.json +1825 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: iMeanAI/Qwen2-VL-TokenSelection-2B
+library_name: peft
+---
+# GAE-Retriever
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** iMeanAI/Qwen2-VL-TokenSelection-2B
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "Qwen2VLForConditionalGeneration",
+    "parent_library": "src.model.vlm_backbone.qwen2_vl_tokenselection.modeling_qwen2_vl"
+  },
+  "base_model_name_or_path": "iMeanAI/Qwen2-VL-TokenSelection-2B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": "gaussian",
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "out_proj",
+    "gate_up_proj",
+    "down_proj",
+    "qkv_proj",
+    "v_proj"
+  ],
+  "task_type": null,
+  "trainable_token_indices": null,
+  "use_dora": true,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57e59eaf8adb7a48d8ffd572e29ddc7916896e64ce90831dd34ae600c4160168
+size 18846106

added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

config.json ADDED Viewed

	@@ -0,0 +1,120 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "iMeanAI/Qwen2-VL-TokenSelection-2B",
+  "architectures": [
+    "Qwen2VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "tokenselection_modules.Qwen2VLConfig",
+    "AutoImageProcessor": "tokenselection_modules.Qwen2VLImageProcessor",
+    "AutoModelForConditionalGeneration": "tokenselection_modules.Qwen2VLForConditionalGeneration",
+    "AutoProcessor": "tokenselection_modules.Qwen2VLProcessor"
+  },
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "lm_skip_layer": [
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2_vl_tokenselection",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "hidden_size": 1536,
+    "in_chans": 3,
+    "model_type": "qwen2_vl_tokenselection",
+    "spatial_patch_size": 14,
+    "torch_dtype": "bfloat16",
+    "vis_skip_layer": [
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1
+    ]
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 151936
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35c1369389ce21aceeea1240eddf1b5857dacf904e7ad17aec4e0403491ce506
+size 37745918

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 1003520,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 12845056,
+    "min_pixels": 3136
+  },
+  "temporal_patch_size": 2
+}

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:975e587fc9522ba13822c89abaedb7c2f5314fb4076560bc8746a11e80594041
+size 15984

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9044147a96de834c32b9aa1094994d3317f7c9ebb8db20fbc5f423b05b1bee64
+size 15984

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d67645c02b3422a6e64befda595169279db5acf9effafd170f1f44cf265f7881
+size 15984

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41c9393853c6ca819ae2cc0796cae537ffed75870b68e108d479e4df0b3d8d8b
+size 15984

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a9077509f4a2af7a2af42a8b17b7f7740c62e83db88f552d293edd6876ca86f
+size 15984

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:052096fe62888786ba71f5d0fb47f208bde9222ad9dc874f29338e7792710565
+size 15984

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56b6f98e4e591f50400ae3d94053aa628bda5489bdd530e0b28c2ce693afb19f
+size 15984

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96e37ee78adfcc9b9f46eaca2c545fd1ff4fd44ffda3e6326894c1fbf6af3594
+size 15984

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c50c6c89f65a3889f4a1e6fdc00146927c5e249b8d743f8a8a25146842bcb890
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
+size 11420371

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,145 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "processor_class": "Qwen2VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

tokenselection_modules.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from src.model.vlm_backbone.qwen2_vl_tokenselection.configuration_qwen2_vl import Qwen2VLConfig
+from src.model.vlm_backbone.qwen2_vl_tokenselection.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
+from src.model.vlm_backbone.qwen2_vl_tokenselection.processing_qwen2_vl import Qwen2VLProcessor
+from src.model.vlm_backbone.qwen2_vl_tokenselection.image_processing_qwen2_vl import Qwen2VLImageProcessor

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1825 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.01953125,
+  "eval_steps": 500,
+  "global_step": 256,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00390625,
+      "grad_norm": 988.0331420898438,
+      "learning_rate": 3.846153846153847e-06,
+      "loss": 23.0443,
+      "step": 1
+    },
+    {
+      "epoch": 0.0078125,
+      "grad_norm": 1018.0032958984375,
+      "learning_rate": 7.692307692307694e-06,
+      "loss": 22.2415,
+      "step": 2
+    },
+    {
+      "epoch": 0.01171875,
+      "grad_norm": 905.46435546875,
+      "learning_rate": 1.153846153846154e-05,
+      "loss": 23.9407,
+      "step": 3
+    },
+    {
+      "epoch": 0.015625,
+      "grad_norm": 772.3015747070312,
+      "learning_rate": 1.5384615384615387e-05,
+      "loss": 20.3585,
+      "step": 4
+    },
+    {
+      "epoch": 0.01953125,
+      "grad_norm": 1083.967529296875,
+      "learning_rate": 1.923076923076923e-05,
+      "loss": 17.7004,
+      "step": 5
+    },
+    {
+      "epoch": 0.0234375,
+      "grad_norm": 518.4392700195312,
+      "learning_rate": 2.307692307692308e-05,
+      "loss": 14.8905,
+      "step": 6
+    },
+    {
+      "epoch": 0.02734375,
+      "grad_norm": 588.9624633789062,
+      "learning_rate": 2.6923076923076923e-05,
+      "loss": 15.46,
+      "step": 7
+    },
+    {
+      "epoch": 0.03125,
+      "grad_norm": 291.337646484375,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 11.7233,
+      "step": 8
+    },
+    {
+      "epoch": 0.03515625,
+      "grad_norm": 243.8762969970703,
+      "learning_rate": 3.461538461538462e-05,
+      "loss": 9.8939,
+      "step": 9
+    },
+    {
+      "epoch": 0.0390625,
+      "grad_norm": 306.4288635253906,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 10.5548,
+      "step": 10
+    },
+    {
+      "epoch": 0.04296875,
+      "grad_norm": 138.4855194091797,
+      "learning_rate": 4.230769230769231e-05,
+      "loss": 7.1429,
+      "step": 11
+    },
+    {
+      "epoch": 0.046875,
+      "grad_norm": 79.80815887451172,
+      "learning_rate": 4.615384615384616e-05,
+      "loss": 7.0453,
+      "step": 12
+    },
+    {
+      "epoch": 0.05078125,
+      "grad_norm": 95.14498901367188,
+      "learning_rate": 5e-05,
+      "loss": 6.2978,
+      "step": 13
+    },
+    {
+      "epoch": 0.0546875,
+      "grad_norm": 76.86641693115234,
+      "learning_rate": 4.9794238683127575e-05,
+      "loss": 6.9177,
+      "step": 14
+    },
+    {
+      "epoch": 0.05859375,
+      "grad_norm": 90.26302337646484,
+      "learning_rate": 4.958847736625515e-05,
+      "loss": 5.7372,
+      "step": 15
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 51.85117721557617,
+      "learning_rate": 4.938271604938271e-05,
+      "loss": 5.7694,
+      "step": 16
+    },
+    {
+      "epoch": 0.06640625,
+      "grad_norm": 28.343177795410156,
+      "learning_rate": 4.9176954732510286e-05,
+      "loss": 5.8857,
+      "step": 17
+    },
+    {
+      "epoch": 0.0703125,
+      "grad_norm": 28.591285705566406,
+      "learning_rate": 4.8971193415637865e-05,
+      "loss": 5.2027,
+      "step": 18
+    },
+    {
+      "epoch": 0.07421875,
+      "grad_norm": 23.616016387939453,
+      "learning_rate": 4.876543209876544e-05,
+      "loss": 4.8946,
+      "step": 19
+    },
+    {
+      "epoch": 0.078125,
+      "grad_norm": 20.2352294921875,
+      "learning_rate": 4.855967078189301e-05,
+      "loss": 5.0189,
+      "step": 20
+    },
+    {
+      "epoch": 0.08203125,
+      "grad_norm": 16.59236717224121,
+      "learning_rate": 4.835390946502058e-05,
+      "loss": 4.9116,
+      "step": 21
+    },
+    {
+      "epoch": 0.0859375,
+      "grad_norm": 15.008003234863281,
+      "learning_rate": 4.814814814814815e-05,
+      "loss": 4.8674,
+      "step": 22
+    },
+    {
+      "epoch": 0.08984375,
+      "grad_norm": 11.656904220581055,
+      "learning_rate": 4.794238683127572e-05,
+      "loss": 4.96,
+      "step": 23
+    },
+    {
+      "epoch": 0.09375,
+      "grad_norm": 17.49643898010254,
+      "learning_rate": 4.773662551440329e-05,
+      "loss": 4.7911,
+      "step": 24
+    },
+    {
+      "epoch": 0.09765625,
+      "grad_norm": 12.967889785766602,
+      "learning_rate": 4.7530864197530866e-05,
+      "loss": 4.7574,
+      "step": 25
+    },
+    {
+      "epoch": 0.1015625,
+      "grad_norm": 13.78781795501709,
+      "learning_rate": 4.732510288065844e-05,
+      "loss": 4.5257,
+      "step": 26
+    },
+    {
+      "epoch": 0.10546875,
+      "grad_norm": 14.40069580078125,
+      "learning_rate": 4.711934156378601e-05,
+      "loss": 4.6842,
+      "step": 27
+    },
+    {
+      "epoch": 0.109375,
+      "grad_norm": 8.558459281921387,
+      "learning_rate": 4.691358024691358e-05,
+      "loss": 4.4912,
+      "step": 28
+    },
+    {
+      "epoch": 0.11328125,
+      "grad_norm": 9.563591003417969,
+      "learning_rate": 4.6707818930041156e-05,
+      "loss": 4.3927,
+      "step": 29
+    },
+    {
+      "epoch": 0.1171875,
+      "grad_norm": 13.471641540527344,
+      "learning_rate": 4.650205761316873e-05,
+      "loss": 4.446,
+      "step": 30
+    },
+    {
+      "epoch": 0.12109375,
+      "grad_norm": 8.875003814697266,
+      "learning_rate": 4.62962962962963e-05,
+      "loss": 4.4367,
+      "step": 31
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 9.281575202941895,
+      "learning_rate": 4.609053497942387e-05,
+      "loss": 4.4643,
+      "step": 32
+    },
+    {
+      "epoch": 0.12890625,
+      "grad_norm": 9.402353286743164,
+      "learning_rate": 4.5884773662551446e-05,
+      "loss": 4.4036,
+      "step": 33
+    },
+    {
+      "epoch": 0.1328125,
+      "grad_norm": 8.509848594665527,
+      "learning_rate": 4.567901234567901e-05,
+      "loss": 4.3953,
+      "step": 34
+    },
+    {
+      "epoch": 0.13671875,
+      "grad_norm": 9.05785846710205,
+      "learning_rate": 4.5473251028806584e-05,
+      "loss": 4.4259,
+      "step": 35
+    },
+    {
+      "epoch": 0.140625,
+      "grad_norm": 7.090909481048584,
+      "learning_rate": 4.5267489711934157e-05,
+      "loss": 4.2375,
+      "step": 36
+    },
+    {
+      "epoch": 0.14453125,
+      "grad_norm": 10.807791709899902,
+      "learning_rate": 4.506172839506173e-05,
+      "loss": 4.3719,
+      "step": 37
+    },
+    {
+      "epoch": 0.1484375,
+      "grad_norm": 13.562170028686523,
+      "learning_rate": 4.48559670781893e-05,
+      "loss": 4.6153,
+      "step": 38
+    },
+    {
+      "epoch": 0.15234375,
+      "grad_norm": 11.16930103302002,
+      "learning_rate": 4.4650205761316874e-05,
+      "loss": 4.4007,
+      "step": 39
+    },
+    {
+      "epoch": 0.15625,
+      "grad_norm": 12.779727935791016,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 4.5876,
+      "step": 40
+    },
+    {
+      "epoch": 0.16015625,
+      "grad_norm": 14.613138198852539,
+      "learning_rate": 4.423868312757202e-05,
+      "loss": 4.3842,
+      "step": 41
+    },
+    {
+      "epoch": 0.1640625,
+      "grad_norm": 16.920902252197266,
+      "learning_rate": 4.403292181069959e-05,
+      "loss": 4.2888,
+      "step": 42
+    },
+    {
+      "epoch": 0.16796875,
+      "grad_norm": 23.178911209106445,
+      "learning_rate": 4.3827160493827164e-05,
+      "loss": 4.4708,
+      "step": 43
+    },
+    {
+      "epoch": 0.171875,
+      "grad_norm": 28.53046989440918,
+      "learning_rate": 4.3621399176954737e-05,
+      "loss": 4.5108,
+      "step": 44
+    },
+    {
+      "epoch": 0.17578125,
+      "grad_norm": 19.8005428314209,
+      "learning_rate": 4.341563786008231e-05,
+      "loss": 4.4592,
+      "step": 45
+    },
+    {
+      "epoch": 0.1796875,
+      "grad_norm": 12.320777893066406,
+      "learning_rate": 4.3209876543209875e-05,
+      "loss": 4.5041,
+      "step": 46
+    },
+    {
+      "epoch": 0.18359375,
+      "grad_norm": 8.350976943969727,
+      "learning_rate": 4.300411522633745e-05,
+      "loss": 4.3769,
+      "step": 47
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 14.553681373596191,
+      "learning_rate": 4.279835390946502e-05,
+      "loss": 4.683,
+      "step": 48
+    },
+    {
+      "epoch": 0.19140625,
+      "grad_norm": 13.202040672302246,
+      "learning_rate": 4.259259259259259e-05,
+      "loss": 4.4271,
+      "step": 49
+    },
+    {
+      "epoch": 0.1953125,
+      "grad_norm": 12.252931594848633,
+      "learning_rate": 4.2386831275720165e-05,
+      "loss": 4.202,
+      "step": 50
+    },
+    {
+      "epoch": 0.19921875,
+      "grad_norm": 9.770834922790527,
+      "learning_rate": 4.2181069958847744e-05,
+      "loss": 4.4271,
+      "step": 51
+    },
+    {
+      "epoch": 0.203125,
+      "grad_norm": 10.324782371520996,
+      "learning_rate": 4.197530864197531e-05,
+      "loss": 4.3385,
+      "step": 52
+    },
+    {
+      "epoch": 0.20703125,
+      "grad_norm": 8.983942031860352,
+      "learning_rate": 4.176954732510288e-05,
+      "loss": 4.2885,
+      "step": 53
+    },
+    {
+      "epoch": 0.2109375,
+      "grad_norm": 8.984896659851074,
+      "learning_rate": 4.1563786008230455e-05,
+      "loss": 4.446,
+      "step": 54
+    },
+    {
+      "epoch": 0.21484375,
+      "grad_norm": 10.578001022338867,
+      "learning_rate": 4.135802469135803e-05,
+      "loss": 4.3787,
+      "step": 55
+    },
+    {
+      "epoch": 0.21875,
+      "grad_norm": 12.245546340942383,
+      "learning_rate": 4.11522633744856e-05,
+      "loss": 4.1896,
+      "step": 56
+    },
+    {
+      "epoch": 0.22265625,
+      "grad_norm": 12.414233207702637,
+      "learning_rate": 4.094650205761317e-05,
+      "loss": 4.4919,
+      "step": 57
+    },
+    {
+      "epoch": 0.2265625,
+      "grad_norm": 10.887884140014648,
+      "learning_rate": 4.074074074074074e-05,
+      "loss": 4.2051,
+      "step": 58
+    },
+    {
+      "epoch": 0.23046875,
+      "grad_norm": 11.619287490844727,
+      "learning_rate": 4.053497942386831e-05,
+      "loss": 4.015,
+      "step": 59
+    },
+    {
+      "epoch": 0.234375,
+      "grad_norm": 16.54395294189453,
+      "learning_rate": 4.032921810699588e-05,
+      "loss": 3.9459,
+      "step": 60
+    },
+    {
+      "epoch": 0.23828125,
+      "grad_norm": 22.401296615600586,
+      "learning_rate": 4.012345679012346e-05,
+      "loss": 3.8599,
+      "step": 61
+    },
+    {
+      "epoch": 0.2421875,
+      "grad_norm": 30.748231887817383,
+      "learning_rate": 3.9917695473251035e-05,
+      "loss": 3.8411,
+      "step": 62
+    },
+    {
+      "epoch": 0.24609375,
+      "grad_norm": 48.33995056152344,
+      "learning_rate": 3.971193415637861e-05,
+      "loss": 3.8338,
+      "step": 63
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 40.25141906738281,
+      "learning_rate": 3.950617283950617e-05,
+      "loss": 3.7575,
+      "step": 64
+    },
+    {
+      "epoch": 0.25390625,
+      "grad_norm": 70.64269256591797,
+      "learning_rate": 3.9300411522633746e-05,
+      "loss": 3.7886,
+      "step": 65
+    },
+    {
+      "epoch": 0.2578125,
+      "grad_norm": 46.51031494140625,
+      "learning_rate": 3.909465020576132e-05,
+      "loss": 3.7727,
+      "step": 66
+    },
+    {
+      "epoch": 0.26171875,
+      "grad_norm": 44.5860481262207,
+      "learning_rate": 3.888888888888889e-05,
+      "loss": 3.7284,
+      "step": 67
+    },
+    {
+      "epoch": 0.265625,
+      "grad_norm": 37.13584899902344,
+      "learning_rate": 3.868312757201646e-05,
+      "loss": 3.7974,
+      "step": 68
+    },
+    {
+      "epoch": 0.26953125,
+      "grad_norm": 17.04970932006836,
+      "learning_rate": 3.8477366255144036e-05,
+      "loss": 3.7399,
+      "step": 69
+    },
+    {
+      "epoch": 0.2734375,
+      "grad_norm": 25.48470687866211,
+      "learning_rate": 3.82716049382716e-05,
+      "loss": 3.6582,
+      "step": 70
+    },
+    {
+      "epoch": 0.27734375,
+      "grad_norm": 15.296791076660156,
+      "learning_rate": 3.806584362139918e-05,
+      "loss": 3.759,
+      "step": 71
+    },
+    {
+      "epoch": 0.28125,
+      "grad_norm": 18.959625244140625,
+      "learning_rate": 3.786008230452675e-05,
+      "loss": 3.6962,
+      "step": 72
+    },
+    {
+      "epoch": 0.28515625,
+      "grad_norm": 16.249454498291016,
+      "learning_rate": 3.7654320987654326e-05,
+      "loss": 3.5659,
+      "step": 73
+    },
+    {
+      "epoch": 0.2890625,
+      "grad_norm": 17.6076602935791,
+      "learning_rate": 3.74485596707819e-05,
+      "loss": 3.6214,
+      "step": 74
+    },
+    {
+      "epoch": 0.29296875,
+      "grad_norm": 15.325469017028809,
+      "learning_rate": 3.724279835390947e-05,
+      "loss": 3.5469,
+      "step": 75
+    },
+    {
+      "epoch": 0.296875,
+      "grad_norm": 14.967309951782227,
+      "learning_rate": 3.7037037037037037e-05,
+      "loss": 3.5844,
+      "step": 76
+    },
+    {
+      "epoch": 0.30078125,
+      "grad_norm": 16.454774856567383,
+      "learning_rate": 3.683127572016461e-05,
+      "loss": 3.4725,
+      "step": 77
+    },
+    {
+      "epoch": 0.3046875,
+      "grad_norm": 15.80962085723877,
+      "learning_rate": 3.662551440329218e-05,
+      "loss": 3.4781,
+      "step": 78
+    },
+    {
+      "epoch": 0.30859375,
+      "grad_norm": 12.834327697753906,
+      "learning_rate": 3.6419753086419754e-05,
+      "loss": 3.6498,
+      "step": 79
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 12.81863784790039,
+      "learning_rate": 3.6213991769547327e-05,
+      "loss": 3.4208,
+      "step": 80
+    },
+    {
+      "epoch": 0.31640625,
+      "grad_norm": 16.791345596313477,
+      "learning_rate": 3.60082304526749e-05,
+      "loss": 3.4432,
+      "step": 81
+    },
+    {
+      "epoch": 0.3203125,
+      "grad_norm": 20.827123641967773,
+      "learning_rate": 3.580246913580247e-05,
+      "loss": 3.4386,
+      "step": 82
+    },
+    {
+      "epoch": 0.32421875,
+      "grad_norm": 14.668889999389648,
+      "learning_rate": 3.5596707818930044e-05,
+      "loss": 3.3887,
+      "step": 83
+    },
+    {
+      "epoch": 0.328125,
+      "grad_norm": 17.22427749633789,
+      "learning_rate": 3.539094650205762e-05,
+      "loss": 3.6215,
+      "step": 84
+    },
+    {
+      "epoch": 0.33203125,
+      "grad_norm": 16.612550735473633,
+      "learning_rate": 3.518518518518519e-05,
+      "loss": 3.4215,
+      "step": 85
+    },
+    {
+      "epoch": 0.3359375,
+      "grad_norm": 15.173816680908203,
+      "learning_rate": 3.497942386831276e-05,
+      "loss": 3.3305,
+      "step": 86
+    },
+    {
+      "epoch": 0.33984375,
+      "grad_norm": 18.891874313354492,
+      "learning_rate": 3.4773662551440334e-05,
+      "loss": 3.4129,
+      "step": 87
+    },
+    {
+      "epoch": 0.34375,
+      "grad_norm": 19.012630462646484,
+      "learning_rate": 3.45679012345679e-05,
+      "loss": 3.4039,
+      "step": 88
+    },
+    {
+      "epoch": 0.34765625,
+      "grad_norm": 25.285091400146484,
+      "learning_rate": 3.436213991769547e-05,
+      "loss": 3.7148,
+      "step": 89
+    },
+    {
+      "epoch": 0.3515625,
+      "grad_norm": 23.138742446899414,
+      "learning_rate": 3.4156378600823045e-05,
+      "loss": 3.3594,
+      "step": 90
+    },
+    {
+      "epoch": 0.35546875,
+      "grad_norm": 24.770191192626953,
+      "learning_rate": 3.395061728395062e-05,
+      "loss": 3.3106,
+      "step": 91
+    },
+    {
+      "epoch": 0.359375,
+      "grad_norm": 17.597614288330078,
+      "learning_rate": 3.374485596707819e-05,
+      "loss": 3.2024,
+      "step": 92
+    },
+    {
+      "epoch": 0.36328125,
+      "grad_norm": 13.329920768737793,
+      "learning_rate": 3.353909465020576e-05,
+      "loss": 3.196,
+      "step": 93
+    },
+    {
+      "epoch": 0.3671875,
+      "grad_norm": 20.01732635498047,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 3.3282,
+      "step": 94
+    },
+    {
+      "epoch": 0.37109375,
+      "grad_norm": 21.501489639282227,
+      "learning_rate": 3.312757201646091e-05,
+      "loss": 3.2842,
+      "step": 95
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 15.159469604492188,
+      "learning_rate": 3.292181069958848e-05,
+      "loss": 3.149,
+      "step": 96
+    },
+    {
+      "epoch": 0.37890625,
+      "grad_norm": 15.139326095581055,
+      "learning_rate": 3.271604938271605e-05,
+      "loss": 3.2335,
+      "step": 97
+    },
+    {
+      "epoch": 0.3828125,
+      "grad_norm": 14.6196870803833,
+      "learning_rate": 3.2510288065843625e-05,
+      "loss": 3.104,
+      "step": 98
+    },
+    {
+      "epoch": 0.38671875,
+      "grad_norm": 12.317699432373047,
+      "learning_rate": 3.230452674897119e-05,
+      "loss": 3.1088,
+      "step": 99
+    },
+    {
+      "epoch": 0.390625,
+      "grad_norm": 17.009883880615234,
+      "learning_rate": 3.209876543209876e-05,
+      "loss": 3.1713,
+      "step": 100
+    },
+    {
+      "epoch": 0.39453125,
+      "grad_norm": 22.170177459716797,
+      "learning_rate": 3.1893004115226336e-05,
+      "loss": 3.1348,
+      "step": 101
+    },
+    {
+      "epoch": 0.3984375,
+      "grad_norm": 18.459367752075195,
+      "learning_rate": 3.168724279835391e-05,
+      "loss": 3.1625,
+      "step": 102
+    },
+    {
+      "epoch": 0.40234375,
+      "grad_norm": 11.8716402053833,
+      "learning_rate": 3.148148148148148e-05,
+      "loss": 3.0979,
+      "step": 103
+    },
+    {
+      "epoch": 0.40625,
+      "grad_norm": 16.9968204498291,
+      "learning_rate": 3.127572016460906e-05,
+      "loss": 3.1705,
+      "step": 104
+    },
+    {
+      "epoch": 0.41015625,
+      "grad_norm": 18.078325271606445,
+      "learning_rate": 3.1069958847736626e-05,
+      "loss": 3.1945,
+      "step": 105
+    },
+    {
+      "epoch": 0.4140625,
+      "grad_norm": 16.14826202392578,
+      "learning_rate": 3.08641975308642e-05,
+      "loss": 3.2458,
+      "step": 106
+    },
+    {
+      "epoch": 0.41796875,
+      "grad_norm": 14.381550788879395,
+      "learning_rate": 3.065843621399177e-05,
+      "loss": 3.1335,
+      "step": 107
+    },
+    {
+      "epoch": 0.421875,
+      "grad_norm": 11.909482955932617,
+      "learning_rate": 3.0452674897119343e-05,
+      "loss": 3.09,
+      "step": 108
+    },
+    {
+      "epoch": 0.42578125,
+      "grad_norm": 17.585634231567383,
+      "learning_rate": 3.0246913580246916e-05,
+      "loss": 3.0548,
+      "step": 109
+    },
+    {
+      "epoch": 0.4296875,
+      "grad_norm": 15.303757667541504,
+      "learning_rate": 3.0041152263374488e-05,
+      "loss": 2.9545,
+      "step": 110
+    },
+    {
+      "epoch": 0.43359375,
+      "grad_norm": 15.899727821350098,
+      "learning_rate": 2.9835390946502057e-05,
+      "loss": 3.0344,
+      "step": 111
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 12.18794059753418,
+      "learning_rate": 2.962962962962963e-05,
+      "loss": 3.1163,
+      "step": 112
+    },
+    {
+      "epoch": 0.44140625,
+      "grad_norm": 15.14059066772461,
+      "learning_rate": 2.9423868312757202e-05,
+      "loss": 3.0572,
+      "step": 113
+    },
+    {
+      "epoch": 0.4453125,
+      "grad_norm": 16.42298698425293,
+      "learning_rate": 2.9218106995884775e-05,
+      "loss": 2.9781,
+      "step": 114
+    },
+    {
+      "epoch": 0.44921875,
+      "grad_norm": 16.090763092041016,
+      "learning_rate": 2.9012345679012347e-05,
+      "loss": 3.1644,
+      "step": 115
+    },
+    {
+      "epoch": 0.453125,
+      "grad_norm": 18.76362419128418,
+      "learning_rate": 2.880658436213992e-05,
+      "loss": 3.0987,
+      "step": 116
+    },
+    {
+      "epoch": 0.45703125,
+      "grad_norm": 17.354793548583984,
+      "learning_rate": 2.860082304526749e-05,
+      "loss": 2.9983,
+      "step": 117
+    },
+    {
+      "epoch": 0.4609375,
+      "grad_norm": 13.282272338867188,
+      "learning_rate": 2.839506172839506e-05,
+      "loss": 3.0733,
+      "step": 118
+    },
+    {
+      "epoch": 0.46484375,
+      "grad_norm": 21.229665756225586,
+      "learning_rate": 2.8189300411522634e-05,
+      "loss": 3.0484,
+      "step": 119
+    },
+    {
+      "epoch": 0.46875,
+      "grad_norm": 16.58381462097168,
+      "learning_rate": 2.7983539094650207e-05,
+      "loss": 3.1174,
+      "step": 120
+    },
+    {
+      "epoch": 0.47265625,
+      "grad_norm": 15.544482231140137,
+      "learning_rate": 2.777777777777778e-05,
+      "loss": 3.1705,
+      "step": 121
+    },
+    {
+      "epoch": 0.4765625,
+      "grad_norm": 15.846022605895996,
+      "learning_rate": 2.757201646090535e-05,
+      "loss": 3.0341,
+      "step": 122
+    },
+    {
+      "epoch": 0.48046875,
+      "grad_norm": 21.867094039916992,
+      "learning_rate": 2.736625514403292e-05,
+      "loss": 2.9884,
+      "step": 123
+    },
+    {
+      "epoch": 0.484375,
+      "grad_norm": 13.162125587463379,
+      "learning_rate": 2.7160493827160493e-05,
+      "loss": 3.0612,
+      "step": 124
+    },
+    {
+      "epoch": 0.48828125,
+      "grad_norm": 11.88615608215332,
+      "learning_rate": 2.6954732510288066e-05,
+      "loss": 2.9845,
+      "step": 125
+    },
+    {
+      "epoch": 0.4921875,
+      "grad_norm": 11.516526222229004,
+      "learning_rate": 2.6748971193415638e-05,
+      "loss": 2.9974,
+      "step": 126
+    },
+    {
+      "epoch": 0.49609375,
+      "grad_norm": 13.762920379638672,
+      "learning_rate": 2.654320987654321e-05,
+      "loss": 3.0515,
+      "step": 127
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 13.856005668640137,
+      "learning_rate": 2.6337448559670787e-05,
+      "loss": 3.0096,
+      "step": 128
+    },
+    {
+      "epoch": 0.50390625,
+      "grad_norm": 12.380202293395996,
+      "learning_rate": 2.6131687242798352e-05,
+      "loss": 3.0274,
+      "step": 129
+    },
+    {
+      "epoch": 0.5078125,
+      "grad_norm": 15.164392471313477,
+      "learning_rate": 2.5925925925925925e-05,
+      "loss": 2.9838,
+      "step": 130
+    },
+    {
+      "epoch": 0.51171875,
+      "grad_norm": 15.10387134552002,
+      "learning_rate": 2.5720164609053497e-05,
+      "loss": 3.1058,
+      "step": 131
+    },
+    {
+      "epoch": 0.515625,
+      "grad_norm": 11.87817096710205,
+      "learning_rate": 2.551440329218107e-05,
+      "loss": 3.0164,
+      "step": 132
+    },
+    {
+      "epoch": 0.51953125,
+      "grad_norm": 17.242656707763672,
+      "learning_rate": 2.5308641975308646e-05,
+      "loss": 2.9376,
+      "step": 133
+    },
+    {
+      "epoch": 0.5234375,
+      "grad_norm": 20.221240997314453,
+      "learning_rate": 2.510288065843622e-05,
+      "loss": 3.0894,
+      "step": 134
+    },
+    {
+      "epoch": 0.52734375,
+      "grad_norm": 12.36820125579834,
+      "learning_rate": 2.4897119341563787e-05,
+      "loss": 3.0347,
+      "step": 135
+    },
+    {
+      "epoch": 0.53125,
+      "grad_norm": 14.862237930297852,
+      "learning_rate": 2.4691358024691357e-05,
+      "loss": 3.0826,
+      "step": 136
+    },
+    {
+      "epoch": 0.53515625,
+      "grad_norm": 14.29667854309082,
+      "learning_rate": 2.4485596707818932e-05,
+      "loss": 3.0467,
+      "step": 137
+    },
+    {
+      "epoch": 0.5390625,
+      "grad_norm": 15.347952842712402,
+      "learning_rate": 2.4279835390946505e-05,
+      "loss": 2.9934,
+      "step": 138
+    },
+    {
+      "epoch": 0.54296875,
+      "grad_norm": 19.236717224121094,
+      "learning_rate": 2.4074074074074074e-05,
+      "loss": 2.8921,
+      "step": 139
+    },
+    {
+      "epoch": 0.546875,
+      "grad_norm": 13.943548202514648,
+      "learning_rate": 2.3868312757201647e-05,
+      "loss": 3.1285,
+      "step": 140
+    },
+    {
+      "epoch": 0.55078125,
+      "grad_norm": 15.554057121276855,
+      "learning_rate": 2.366255144032922e-05,
+      "loss": 2.9709,
+      "step": 141
+    },
+    {
+      "epoch": 0.5546875,
+      "grad_norm": 15.124194145202637,
+      "learning_rate": 2.345679012345679e-05,
+      "loss": 2.9845,
+      "step": 142
+    },
+    {
+      "epoch": 0.55859375,
+      "grad_norm": 15.458159446716309,
+      "learning_rate": 2.3251028806584364e-05,
+      "loss": 2.9581,
+      "step": 143
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 11.531893730163574,
+      "learning_rate": 2.3045267489711937e-05,
+      "loss": 3.0545,
+      "step": 144
+    },
+    {
+      "epoch": 0.56640625,
+      "grad_norm": 16.634984970092773,
+      "learning_rate": 2.2839506172839506e-05,
+      "loss": 2.9772,
+      "step": 145
+    },
+    {
+      "epoch": 0.5703125,
+      "grad_norm": 13.670973777770996,
+      "learning_rate": 2.2633744855967078e-05,
+      "loss": 2.9478,
+      "step": 146
+    },
+    {
+      "epoch": 0.57421875,
+      "grad_norm": 13.582576751708984,
+      "learning_rate": 2.242798353909465e-05,
+      "loss": 3.1053,
+      "step": 147
+    },
+    {
+      "epoch": 0.578125,
+      "grad_norm": 17.05980110168457,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 2.9789,
+      "step": 148
+    },
+    {
+      "epoch": 0.58203125,
+      "grad_norm": 13.586709976196289,
+      "learning_rate": 2.2016460905349796e-05,
+      "loss": 2.9268,
+      "step": 149
+    },
+    {
+      "epoch": 0.5859375,
+      "grad_norm": 14.885200500488281,
+      "learning_rate": 2.1810699588477368e-05,
+      "loss": 3.0281,
+      "step": 150
+    },
+    {
+      "epoch": 0.58984375,
+      "grad_norm": 14.411493301391602,
+      "learning_rate": 2.1604938271604937e-05,
+      "loss": 2.9164,
+      "step": 151
+    },
+    {
+      "epoch": 0.59375,
+      "grad_norm": 16.72563934326172,
+      "learning_rate": 2.139917695473251e-05,
+      "loss": 2.9586,
+      "step": 152
+    },
+    {
+      "epoch": 0.59765625,
+      "grad_norm": 11.668434143066406,
+      "learning_rate": 2.1193415637860082e-05,
+      "loss": 3.0239,
+      "step": 153
+    },
+    {
+      "epoch": 0.6015625,
+      "grad_norm": 11.849235534667969,
+      "learning_rate": 2.0987654320987655e-05,
+      "loss": 2.9665,
+      "step": 154
+    },
+    {
+      "epoch": 0.60546875,
+      "grad_norm": 17.961620330810547,
+      "learning_rate": 2.0781893004115227e-05,
+      "loss": 3.0192,
+      "step": 155
+    },
+    {
+      "epoch": 0.609375,
+      "grad_norm": 13.321170806884766,
+      "learning_rate": 2.05761316872428e-05,
+      "loss": 2.9901,
+      "step": 156
+    },
+    {
+      "epoch": 0.61328125,
+      "grad_norm": 12.714600563049316,
+      "learning_rate": 2.037037037037037e-05,
+      "loss": 2.9933,
+      "step": 157
+    },
+    {
+      "epoch": 0.6171875,
+      "grad_norm": 11.27708625793457,
+      "learning_rate": 2.016460905349794e-05,
+      "loss": 2.9186,
+      "step": 158
+    },
+    {
+      "epoch": 0.62109375,
+      "grad_norm": 11.366385459899902,
+      "learning_rate": 1.9958847736625517e-05,
+      "loss": 2.9615,
+      "step": 159
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 14.28786849975586,
+      "learning_rate": 1.9753086419753087e-05,
+      "loss": 3.0847,
+      "step": 160
+    },
+    {
+      "epoch": 0.62890625,
+      "grad_norm": 17.186941146850586,
+      "learning_rate": 1.954732510288066e-05,
+      "loss": 3.1361,
+      "step": 161
+    },
+    {
+      "epoch": 0.6328125,
+      "grad_norm": 12.188273429870605,
+      "learning_rate": 1.934156378600823e-05,
+      "loss": 3.0134,
+      "step": 162
+    },
+    {
+      "epoch": 0.63671875,
+      "grad_norm": 18.98809051513672,
+      "learning_rate": 1.91358024691358e-05,
+      "loss": 2.904,
+      "step": 163
+    },
+    {
+      "epoch": 0.640625,
+      "grad_norm": 12.578585624694824,
+      "learning_rate": 1.8930041152263377e-05,
+      "loss": 2.9072,
+      "step": 164
+    },
+    {
+      "epoch": 0.64453125,
+      "grad_norm": 13.295378684997559,
+      "learning_rate": 1.872427983539095e-05,
+      "loss": 3.0163,
+      "step": 165
+    },
+    {
+      "epoch": 0.6484375,
+      "grad_norm": 16.138151168823242,
+      "learning_rate": 1.8518518518518518e-05,
+      "loss": 2.9509,
+      "step": 166
+    },
+    {
+      "epoch": 0.65234375,
+      "grad_norm": 18.39076805114746,
+      "learning_rate": 1.831275720164609e-05,
+      "loss": 2.9098,
+      "step": 167
+    },
+    {
+      "epoch": 0.65625,
+      "grad_norm": 19.46346664428711,
+      "learning_rate": 1.8106995884773663e-05,
+      "loss": 2.9096,
+      "step": 168
+    },
+    {
+      "epoch": 0.66015625,
+      "grad_norm": 13.3604154586792,
+      "learning_rate": 1.7901234567901236e-05,
+      "loss": 2.8992,
+      "step": 169
+    },
+    {
+      "epoch": 0.6640625,
+      "grad_norm": 12.542205810546875,
+      "learning_rate": 1.769547325102881e-05,
+      "loss": 2.9293,
+      "step": 170
+    },
+    {
+      "epoch": 0.66796875,
+      "grad_norm": 12.55959415435791,
+      "learning_rate": 1.748971193415638e-05,
+      "loss": 2.8809,
+      "step": 171
+    },
+    {
+      "epoch": 0.671875,
+      "grad_norm": 15.57677936553955,
+      "learning_rate": 1.728395061728395e-05,
+      "loss": 2.7552,
+      "step": 172
+    },
+    {
+      "epoch": 0.67578125,
+      "grad_norm": 11.344679832458496,
+      "learning_rate": 1.7078189300411522e-05,
+      "loss": 2.9377,
+      "step": 173
+    },
+    {
+      "epoch": 0.6796875,
+      "grad_norm": 15.26870059967041,
+      "learning_rate": 1.6872427983539095e-05,
+      "loss": 2.9145,
+      "step": 174
+    },
+    {
+      "epoch": 0.68359375,
+      "grad_norm": 12.774370193481445,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 2.9775,
+      "step": 175
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 19.02437400817871,
+      "learning_rate": 1.646090534979424e-05,
+      "loss": 2.9581,
+      "step": 176
+    },
+    {
+      "epoch": 0.69140625,
+      "grad_norm": 14.445993423461914,
+      "learning_rate": 1.6255144032921812e-05,
+      "loss": 2.9639,
+      "step": 177
+    },
+    {
+      "epoch": 0.6953125,
+      "grad_norm": 13.563401222229004,
+      "learning_rate": 1.604938271604938e-05,
+      "loss": 2.8964,
+      "step": 178
+    },
+    {
+      "epoch": 0.69921875,
+      "grad_norm": 17.80751609802246,
+      "learning_rate": 1.5843621399176954e-05,
+      "loss": 3.0603,
+      "step": 179
+    },
+    {
+      "epoch": 0.703125,
+      "grad_norm": 13.13770866394043,
+      "learning_rate": 1.563786008230453e-05,
+      "loss": 3.0473,
+      "step": 180
+    },
+    {
+      "epoch": 0.70703125,
+      "grad_norm": 15.236124038696289,
+      "learning_rate": 1.54320987654321e-05,
+      "loss": 2.8392,
+      "step": 181
+    },
+    {
+      "epoch": 0.7109375,
+      "grad_norm": 17.386003494262695,
+      "learning_rate": 1.5226337448559672e-05,
+      "loss": 2.8805,
+      "step": 182
+    },
+    {
+      "epoch": 0.71484375,
+      "grad_norm": 16.233539581298828,
+      "learning_rate": 1.5020576131687244e-05,
+      "loss": 2.9692,
+      "step": 183
+    },
+    {
+      "epoch": 0.71875,
+      "grad_norm": 19.721954345703125,
+      "learning_rate": 1.4814814814814815e-05,
+      "loss": 3.0988,
+      "step": 184
+    },
+    {
+      "epoch": 0.72265625,
+      "grad_norm": 12.52316951751709,
+      "learning_rate": 1.4609053497942387e-05,
+      "loss": 2.933,
+      "step": 185
+    },
+    {
+      "epoch": 0.7265625,
+      "grad_norm": 12.95042610168457,
+      "learning_rate": 1.440329218106996e-05,
+      "loss": 2.927,
+      "step": 186
+    },
+    {
+      "epoch": 0.73046875,
+      "grad_norm": 12.077346801757812,
+      "learning_rate": 1.419753086419753e-05,
+      "loss": 2.8952,
+      "step": 187
+    },
+    {
+      "epoch": 0.734375,
+      "grad_norm": 11.764900207519531,
+      "learning_rate": 1.3991769547325103e-05,
+      "loss": 2.8488,
+      "step": 188
+    },
+    {
+      "epoch": 0.73828125,
+      "grad_norm": 13.644368171691895,
+      "learning_rate": 1.3786008230452676e-05,
+      "loss": 2.8838,
+      "step": 189
+    },
+    {
+      "epoch": 0.7421875,
+      "grad_norm": 12.35655689239502,
+      "learning_rate": 1.3580246913580247e-05,
+      "loss": 2.9183,
+      "step": 190
+    },
+    {
+      "epoch": 0.74609375,
+      "grad_norm": 13.980887413024902,
+      "learning_rate": 1.3374485596707819e-05,
+      "loss": 2.8803,
+      "step": 191
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 18.003271102905273,
+      "learning_rate": 1.3168724279835393e-05,
+      "loss": 3.0357,
+      "step": 192
+    },
+    {
+      "epoch": 0.75390625,
+      "grad_norm": 14.126523971557617,
+      "learning_rate": 1.2962962962962962e-05,
+      "loss": 2.9313,
+      "step": 193
+    },
+    {
+      "epoch": 0.7578125,
+      "grad_norm": 12.55939769744873,
+      "learning_rate": 1.2757201646090535e-05,
+      "loss": 2.9224,
+      "step": 194
+    },
+    {
+      "epoch": 0.76171875,
+      "grad_norm": 13.05750846862793,
+      "learning_rate": 1.255144032921811e-05,
+      "loss": 2.9167,
+      "step": 195
+    },
+    {
+      "epoch": 0.765625,
+      "grad_norm": 16.52781867980957,
+      "learning_rate": 1.2345679012345678e-05,
+      "loss": 2.8645,
+      "step": 196
+    },
+    {
+      "epoch": 0.76953125,
+      "grad_norm": 15.458019256591797,
+      "learning_rate": 1.2139917695473252e-05,
+      "loss": 2.9848,
+      "step": 197
+    },
+    {
+      "epoch": 0.7734375,
+      "grad_norm": 12.694581985473633,
+      "learning_rate": 1.1934156378600823e-05,
+      "loss": 3.0691,
+      "step": 198
+    },
+    {
+      "epoch": 0.77734375,
+      "grad_norm": 11.771615982055664,
+      "learning_rate": 1.1728395061728396e-05,
+      "loss": 2.9316,
+      "step": 199
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 15.438037872314453,
+      "learning_rate": 1.1522633744855968e-05,
+      "loss": 2.9295,
+      "step": 200
+    },
+    {
+      "epoch": 0.78515625,
+      "grad_norm": 13.371684074401855,
+      "learning_rate": 1.1316872427983539e-05,
+      "loss": 2.8849,
+      "step": 201
+    },
+    {
+      "epoch": 0.7890625,
+      "grad_norm": 11.904099464416504,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 2.862,
+      "step": 202
+    },
+    {
+      "epoch": 0.79296875,
+      "grad_norm": 11.638395309448242,
+      "learning_rate": 1.0905349794238684e-05,
+      "loss": 2.949,
+      "step": 203
+    },
+    {
+      "epoch": 0.796875,
+      "grad_norm": 14.490175247192383,
+      "learning_rate": 1.0699588477366255e-05,
+      "loss": 2.9131,
+      "step": 204
+    },
+    {
+      "epoch": 0.80078125,
+      "grad_norm": 11.613717079162598,
+      "learning_rate": 1.0493827160493827e-05,
+      "loss": 2.9497,
+      "step": 205
+    },
+    {
+      "epoch": 0.8046875,
+      "grad_norm": 13.931456565856934,
+      "learning_rate": 1.02880658436214e-05,
+      "loss": 2.917,
+      "step": 206
+    },
+    {
+      "epoch": 0.80859375,
+      "grad_norm": 12.256906509399414,
+      "learning_rate": 1.008230452674897e-05,
+      "loss": 2.9048,
+      "step": 207
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 12.851861953735352,
+      "learning_rate": 9.876543209876543e-06,
+      "loss": 2.9084,
+      "step": 208
+    },
+    {
+      "epoch": 0.81640625,
+      "grad_norm": 18.05097770690918,
+      "learning_rate": 9.670781893004116e-06,
+      "loss": 2.802,
+      "step": 209
+    },
+    {
+      "epoch": 0.8203125,
+      "grad_norm": 12.831151962280273,
+      "learning_rate": 9.465020576131688e-06,
+      "loss": 2.8659,
+      "step": 210
+    },
+    {
+      "epoch": 0.82421875,
+      "grad_norm": 11.60468864440918,
+      "learning_rate": 9.259259259259259e-06,
+      "loss": 2.8285,
+      "step": 211
+    },
+    {
+      "epoch": 0.828125,
+      "grad_norm": 14.392627716064453,
+      "learning_rate": 9.053497942386832e-06,
+      "loss": 2.8605,
+      "step": 212
+    },
+    {
+      "epoch": 0.83203125,
+      "grad_norm": 17.23533821105957,
+      "learning_rate": 8.847736625514404e-06,
+      "loss": 2.9186,
+      "step": 213
+    },
+    {
+      "epoch": 0.8359375,
+      "grad_norm": 13.937773704528809,
+      "learning_rate": 8.641975308641975e-06,
+      "loss": 2.9164,
+      "step": 214
+    },
+    {
+      "epoch": 0.83984375,
+      "grad_norm": 16.57691764831543,
+      "learning_rate": 8.436213991769547e-06,
+      "loss": 2.8829,
+      "step": 215
+    },
+    {
+      "epoch": 0.84375,
+      "grad_norm": 12.391244888305664,
+      "learning_rate": 8.23045267489712e-06,
+      "loss": 2.8417,
+      "step": 216
+    },
+    {
+      "epoch": 0.84765625,
+      "grad_norm": 15.762882232666016,
+      "learning_rate": 8.02469135802469e-06,
+      "loss": 2.871,
+      "step": 217
+    },
+    {
+      "epoch": 0.8515625,
+      "grad_norm": 16.37859344482422,
+      "learning_rate": 7.818930041152265e-06,
+      "loss": 2.9265,
+      "step": 218
+    },
+    {
+      "epoch": 0.85546875,
+      "grad_norm": 21.5294132232666,
+      "learning_rate": 7.613168724279836e-06,
+      "loss": 2.9628,
+      "step": 219
+    },
+    {
+      "epoch": 0.859375,
+      "grad_norm": 18.281295776367188,
+      "learning_rate": 7.4074074074074075e-06,
+      "loss": 2.8301,
+      "step": 220
+    },
+    {
+      "epoch": 0.86328125,
+      "grad_norm": 16.0869140625,
+      "learning_rate": 7.20164609053498e-06,
+      "loss": 2.8179,
+      "step": 221
+    },
+    {
+      "epoch": 0.8671875,
+      "grad_norm": 21.662134170532227,
+      "learning_rate": 6.995884773662552e-06,
+      "loss": 2.8893,
+      "step": 222
+    },
+    {
+      "epoch": 0.87109375,
+      "grad_norm": 12.4130277633667,
+      "learning_rate": 6.790123456790123e-06,
+      "loss": 2.8628,
+      "step": 223
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 15.154963493347168,
+      "learning_rate": 6.584362139917697e-06,
+      "loss": 2.8991,
+      "step": 224
+    },
+    {
+      "epoch": 0.87890625,
+      "grad_norm": 12.677474021911621,
+      "learning_rate": 6.3786008230452675e-06,
+      "loss": 2.9428,
+      "step": 225
+    },
+    {
+      "epoch": 0.8828125,
+      "grad_norm": 12.973712921142578,
+      "learning_rate": 6.172839506172839e-06,
+      "loss": 2.8694,
+      "step": 226
+    },
+    {
+      "epoch": 0.88671875,
+      "grad_norm": 13.162025451660156,
+      "learning_rate": 5.967078189300412e-06,
+      "loss": 2.8119,
+      "step": 227
+    },
+    {
+      "epoch": 0.890625,
+      "grad_norm": 13.606464385986328,
+      "learning_rate": 5.761316872427984e-06,
+      "loss": 2.8455,
+      "step": 228
+    },
+    {
+      "epoch": 0.89453125,
+      "grad_norm": 17.518091201782227,
+      "learning_rate": 5.555555555555556e-06,
+      "loss": 2.8708,
+      "step": 229
+    },
+    {
+      "epoch": 0.8984375,
+      "grad_norm": 13.689889907836914,
+      "learning_rate": 5.3497942386831275e-06,
+      "loss": 2.8384,
+      "step": 230
+    },
+    {
+      "epoch": 0.90234375,
+      "grad_norm": 14.69385051727295,
+      "learning_rate": 5.1440329218107e-06,
+      "loss": 2.8291,
+      "step": 231
+    },
+    {
+      "epoch": 0.90625,
+      "grad_norm": 17.468791961669922,
+      "learning_rate": 4.938271604938272e-06,
+      "loss": 2.8588,
+      "step": 232
+    },
+    {
+      "epoch": 0.91015625,
+      "grad_norm": 14.33683967590332,
+      "learning_rate": 4.732510288065844e-06,
+      "loss": 2.7762,
+      "step": 233
+    },
+    {
+      "epoch": 0.9140625,
+      "grad_norm": 17.65381622314453,
+      "learning_rate": 4.526748971193416e-06,
+      "loss": 2.8697,
+      "step": 234
+    },
+    {
+      "epoch": 0.91796875,
+      "grad_norm": 17.74317169189453,
+      "learning_rate": 4.3209876543209875e-06,
+      "loss": 2.8151,
+      "step": 235
+    },
+    {
+      "epoch": 0.921875,
+      "grad_norm": 20.99629020690918,
+      "learning_rate": 4.11522633744856e-06,
+      "loss": 2.893,
+      "step": 236
+    },
+    {
+      "epoch": 0.92578125,
+      "grad_norm": 17.19089698791504,
+      "learning_rate": 3.9094650205761325e-06,
+      "loss": 2.8025,
+      "step": 237
+    },
+    {
+      "epoch": 0.9296875,
+      "grad_norm": 12.898162841796875,
+      "learning_rate": 3.7037037037037037e-06,
+      "loss": 2.9405,
+      "step": 238
+    },
+    {
+      "epoch": 0.93359375,
+      "grad_norm": 13.902021408081055,
+      "learning_rate": 3.497942386831276e-06,
+      "loss": 2.8937,
+      "step": 239
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 13.255682945251465,
+      "learning_rate": 3.2921810699588483e-06,
+      "loss": 2.7604,
+      "step": 240
+    },
+    {
+      "epoch": 0.94140625,
+      "grad_norm": 14.98218822479248,
+      "learning_rate": 3.0864197530864196e-06,
+      "loss": 2.8931,
+      "step": 241
+    },
+    {
+      "epoch": 0.9453125,
+      "grad_norm": 15.084565162658691,
+      "learning_rate": 2.880658436213992e-06,
+      "loss": 2.8565,
+      "step": 242
+    },
+    {
+      "epoch": 0.94921875,
+      "grad_norm": 12.473812103271484,
+      "learning_rate": 2.6748971193415637e-06,
+      "loss": 2.795,
+      "step": 243
+    },
+    {
+      "epoch": 0.953125,
+      "grad_norm": 13.222654342651367,
+      "learning_rate": 2.469135802469136e-06,
+      "loss": 2.8734,
+      "step": 244
+    },
+    {
+      "epoch": 0.95703125,
+      "grad_norm": 13.204935073852539,
+      "learning_rate": 2.263374485596708e-06,
+      "loss": 2.8013,
+      "step": 245
+    },
+    {
+      "epoch": 0.9609375,
+      "grad_norm": 15.458930015563965,
+      "learning_rate": 2.05761316872428e-06,
+      "loss": 2.8541,
+      "step": 246
+    },
+    {
+      "epoch": 0.96484375,
+      "grad_norm": 17.476573944091797,
+      "learning_rate": 1.8518518518518519e-06,
+      "loss": 2.7538,
+      "step": 247
+    },
+    {
+      "epoch": 0.96875,
+      "grad_norm": 13.751199722290039,
+      "learning_rate": 1.6460905349794242e-06,
+      "loss": 2.8324,
+      "step": 248
+    },
+    {
+      "epoch": 0.97265625,
+      "grad_norm": 15.517616271972656,
+      "learning_rate": 1.440329218106996e-06,
+      "loss": 2.8402,
+      "step": 249
+    },
+    {
+      "epoch": 0.9765625,
+      "grad_norm": 16.016067504882812,
+      "learning_rate": 1.234567901234568e-06,
+      "loss": 2.895,
+      "step": 250
+    },
+    {
+      "epoch": 0.98046875,
+      "grad_norm": 11.978571891784668,
+      "learning_rate": 1.02880658436214e-06,
+      "loss": 2.9139,
+      "step": 251
+    },
+    {
+      "epoch": 1.00390625,
+      "grad_norm": 58.774688720703125,
+      "learning_rate": 8.230452674897121e-07,
+      "loss": 3.7051,
+      "step": 252
+    },
+    {
+      "epoch": 1.0078125,
+      "grad_norm": 52.733154296875,
+      "learning_rate": 6.17283950617284e-07,
+      "loss": 3.7482,
+      "step": 253
+    },
+    {
+      "epoch": 1.01171875,
+      "grad_norm": 68.14625549316406,
+      "learning_rate": 4.1152263374485604e-07,
+      "loss": 3.8065,
+      "step": 254
+    },
+    {
+      "epoch": 1.015625,
+      "grad_norm": 59.991546630859375,
+      "learning_rate": 2.0576131687242802e-07,
+      "loss": 3.6531,
+      "step": 255
+    },
+    {
+      "epoch": 1.01953125,
+      "grad_norm": 40.77336883544922,
+      "learning_rate": 0.0,
+      "loss": 3.4788,
+      "step": 256
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 256,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 1.0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 128,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a49daff0cdab71597b8bdd2a8e6a682d2b83844ed0288e7303a21b4aba64b31
+size 6008

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff