RohitSur commited on
Commit
643b8aa
·
verified ·
1 Parent(s): e3481ab

Upload folder using huggingface_hub

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>
2
+ {% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}
config.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SmolVLMForConditionalGeneration"
4
+ ],
5
+ "bos_token_id": 1,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 49279,
8
+ "image_token_id": 49190,
9
+ "model_type": "smolvlm",
10
+ "pad_token_id": 2,
11
+ "scale_factor": 4,
12
+ "text_config": {
13
+ "_flash_attn_2_enabled": true,
14
+ "_name_or_path": "None",
15
+ "architectures": [
16
+ "VLlama3ForCausalLM"
17
+ ],
18
+ "attention_bias": false,
19
+ "attention_dropout": 0.0,
20
+ "bos_token_id": 1,
21
+ "dtype": "bfloat16",
22
+ "eos_token_id": 2,
23
+ "head_dim": 64,
24
+ "hidden_act": "silu",
25
+ "hidden_size": 960,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 2560,
28
+ "is_llama_config": true,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "model_type": "llama",
32
+ "neftune_noise_alpha": 0.0,
33
+ "num_attention_heads": 15,
34
+ "num_hidden_layers": 32,
35
+ "num_key_value_heads": 5,
36
+ "pad_token_id": 2,
37
+ "perceiver_config": {
38
+ "_attn_implementation_autoset": false,
39
+ "_name_or_path": "",
40
+ "add_cross_attention": false,
41
+ "architectures": null,
42
+ "attention_dropout": 0.0,
43
+ "bad_words_ids": null,
44
+ "begin_suppress_tokens": null,
45
+ "bos_token_id": null,
46
+ "chunk_size_feed_forward": 0,
47
+ "cross_attention_hidden_size": null,
48
+ "decoder_start_token_id": null,
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": false,
51
+ "early_stopping": false,
52
+ "encoder_no_repeat_ngram_size": 0,
53
+ "eos_token_id": null,
54
+ "exponential_decay_length_penalty": null,
55
+ "finetuning_task": null,
56
+ "forced_bos_token_id": null,
57
+ "forced_eos_token_id": null,
58
+ "hidden_act": "silu",
59
+ "id2label": {
60
+ "0": "LABEL_0",
61
+ "1": "LABEL_1"
62
+ },
63
+ "is_decoder": false,
64
+ "is_encoder_decoder": false,
65
+ "label2id": {
66
+ "LABEL_0": 0,
67
+ "LABEL_1": 1
68
+ },
69
+ "length_penalty": 1.0,
70
+ "max_length": 20,
71
+ "min_length": 0,
72
+ "model_type": "vllama3",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_beam_groups": 1,
75
+ "num_beams": 1,
76
+ "num_key_value_heads": 1,
77
+ "num_return_sequences": 1,
78
+ "output_attentions": false,
79
+ "output_hidden_states": false,
80
+ "output_scores": false,
81
+ "pad_token_id": null,
82
+ "prefix": null,
83
+ "problem_type": null,
84
+ "pruned_heads": {},
85
+ "qk_layer_norms_perceiver": false,
86
+ "remove_invalid_values": false,
87
+ "repetition_penalty": 1.0,
88
+ "resampler_depth": 6,
89
+ "resampler_head_dim": 96,
90
+ "resampler_n_heads": 16,
91
+ "resampler_n_latents": 64,
92
+ "return_dict": true,
93
+ "return_dict_in_generate": false,
94
+ "sep_token_id": null,
95
+ "suppress_tokens": null,
96
+ "task_specific_params": null,
97
+ "temperature": 1.0,
98
+ "tf_legacy_loss": false,
99
+ "tie_encoder_decoder": false,
100
+ "tie_word_embeddings": true,
101
+ "tokenizer_class": null,
102
+ "top_k": 50,
103
+ "top_p": 1.0,
104
+ "torch_dtype": null,
105
+ "torchscript": false,
106
+ "transformers_version": "4.46.0",
107
+ "typical_p": 1.0,
108
+ "use_bfloat16": false
109
+ },
110
+ "pixel_shuffle_factor": 4,
111
+ "pretraining_tp": 1,
112
+ "qk_layer_norms": false,
113
+ "rms_norm_eps": 1e-05,
114
+ "rope_interleaved": false,
115
+ "rope_parameters": {
116
+ "rope_theta": 100000,
117
+ "rope_type": "default"
118
+ },
119
+ "tie_word_embeddings": false,
120
+ "transformers.js_config": {
121
+ "kv_cache_dtype": {
122
+ "fp16": "float16",
123
+ "q4f16": "float16"
124
+ }
125
+ },
126
+ "use_cache": true,
127
+ "use_resampler": false,
128
+ "vocab_size": 49280
129
+ },
130
+ "tie_word_embeddings": false,
131
+ "transformers.js_config": {
132
+ "kv_cache_dtype": {
133
+ "fp16": "float16",
134
+ "q4f16": "float16"
135
+ }
136
+ },
137
+ "transformers_version": "5.0.0",
138
+ "use_cache": false,
139
+ "use_reentrant_checkpointing": false,
140
+ "vision_config": {
141
+ "attention_dropout": 0.0,
142
+ "dtype": "bfloat16",
143
+ "hidden_act": "gelu_pytorch_tanh",
144
+ "hidden_size": 768,
145
+ "image_size": 512,
146
+ "initializer_range": 0.02,
147
+ "intermediate_size": 3072,
148
+ "layer_norm_eps": 1e-06,
149
+ "max_image_size": {
150
+ "longest_edge": 512
151
+ },
152
+ "model_type": "smolvlm_vision",
153
+ "num_attention_heads": 12,
154
+ "num_channels": 3,
155
+ "num_hidden_layers": 12,
156
+ "patch_size": 16,
157
+ "size": {
158
+ "longest_edge": 2048
159
+ },
160
+ "tie_word_embeddings": false,
161
+ "use_base_siglip": false
162
+ },
163
+ "vocab_size": 49280
164
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 49279,
6
+ 49279
7
+ ],
8
+ "pad_token_id": 2,
9
+ "transformers_version": "5.0.0"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c8c5e8d4cf1b4a0a834bcb539e3d8389e63a6306a0ce97d3d67be2ec4c4d48
3
+ size 1015025832
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a34a45c16f8d91835b8f6085165a440fe23896373ed2c70df14ddd4964d761dd
3
+ size 1637261579
processor_config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_image_splitting": true,
5
+ "do_normalize": true,
6
+ "do_pad": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_processor_type": "SmolVLMImageProcessor",
15
+ "image_std": [
16
+ 0.5,
17
+ 0.5,
18
+ 0.5
19
+ ],
20
+ "max_image_size": {
21
+ "longest_edge": 512
22
+ },
23
+ "resample": 1,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "longest_edge": 2048
27
+ },
28
+ "video_sampling": {
29
+ "fps": 1,
30
+ "max_frames": 64,
31
+ "video_size": {
32
+ "longest_edge": 512
33
+ }
34
+ }
35
+ },
36
+ "image_seq_len": 64,
37
+ "processor_class": "SmolVLMProcessor",
38
+ "video_processor": {
39
+ "data_format": "channels_first",
40
+ "default_to_square": true,
41
+ "do_convert_rgb": true,
42
+ "do_image_splitting": true,
43
+ "do_normalize": true,
44
+ "do_pad": true,
45
+ "do_rescale": true,
46
+ "do_resize": true,
47
+ "do_sample_frames": false,
48
+ "fps": 1,
49
+ "image_mean": [
50
+ 0.5,
51
+ 0.5,
52
+ 0.5
53
+ ],
54
+ "image_processor_type": "SmolVLMImageProcessor",
55
+ "image_std": [
56
+ 0.5,
57
+ 0.5,
58
+ 0.5
59
+ ],
60
+ "max_image_size": {
61
+ "longest_edge": 512
62
+ },
63
+ "num_frames": 64,
64
+ "resample": 1,
65
+ "rescale_factor": 0.00392156862745098,
66
+ "return_metadata": false,
67
+ "size": {
68
+ "longest_edge": 2048
69
+ },
70
+ "video_processor_type": "SmolVLMVideoProcessor",
71
+ "video_sampling": {
72
+ "fps": 1,
73
+ "max_frames": 64,
74
+ "video_size": {
75
+ "longest_edge": 2048
76
+ }
77
+ }
78
+ }
79
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3d2378bc5b746c762327d4822ca1447f1f50eb97a95a925d360512fec0cf2bc
3
+ size 1465
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|im_start|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "end_of_utterance_token": "<end_of_utterance>",
7
+ "eos_token": "<end_of_utterance>",
8
+ "extra_special_tokens": [
9
+ "<fake_token_around_image>",
10
+ "<image>",
11
+ "<end_of_utterance>"
12
+ ],
13
+ "fake_image_token": "<fake_token_around_image>",
14
+ "global_image_token": "<global-img>",
15
+ "image_token": "<image>",
16
+ "is_local": false,
17
+ "legacy": false,
18
+ "model_max_length": 8192,
19
+ "model_specific_special_tokens": {
20
+ "end_of_utterance_token": "<end_of_utterance>",
21
+ "fake_image_token": "<fake_token_around_image>",
22
+ "global_image_token": "<global-img>",
23
+ "image_token": "<image>"
24
+ },
25
+ "pad_token": "<|im_end|>",
26
+ "processor_class": "SmolVLMProcessor",
27
+ "tokenizer_class": "TokenizersBackend",
28
+ "truncation_side": "left",
29
+ "unk_token": "<|endoftext|>",
30
+ "vocab_size": 49152
31
+ }
trainer_state.json ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 950,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.79375,
14
+ "epoch": 0.021052631578947368,
15
+ "grad_norm": 6.71875,
16
+ "learning_rate": 4.9526315789473685e-05,
17
+ "loss": 1.7312183380126953,
18
+ "mean_token_accuracy": 0.6867853045463562,
19
+ "num_tokens": 840.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.52421875,
24
+ "epoch": 0.042105263157894736,
25
+ "grad_norm": 3.90625,
26
+ "learning_rate": 4.9e-05,
27
+ "loss": 1.3260906219482422,
28
+ "mean_token_accuracy": 0.7451180815696716,
29
+ "num_tokens": 1573.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.69921875,
34
+ "epoch": 0.06315789473684211,
35
+ "grad_norm": 6.5625,
36
+ "learning_rate": 4.847368421052632e-05,
37
+ "loss": 1.6661336898803711,
38
+ "mean_token_accuracy": 0.7098163902759552,
39
+ "num_tokens": 2338.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.5484375,
44
+ "epoch": 0.08421052631578947,
45
+ "grad_norm": 7.4375,
46
+ "learning_rate": 4.794736842105264e-05,
47
+ "loss": 1.5569435119628907,
48
+ "mean_token_accuracy": 0.7198988318443298,
49
+ "num_tokens": 3038.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.59296875,
54
+ "epoch": 0.10526315789473684,
55
+ "grad_norm": 8.1875,
56
+ "learning_rate": 4.742105263157895e-05,
57
+ "loss": 1.5420040130615233,
58
+ "mean_token_accuracy": 0.7149245262145996,
59
+ "num_tokens": 3682.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.80390625,
64
+ "epoch": 0.12631578947368421,
65
+ "grad_norm": 9.9375,
66
+ "learning_rate": 4.6894736842105264e-05,
67
+ "loss": 1.6832901000976563,
68
+ "mean_token_accuracy": 0.7065998375415802,
69
+ "num_tokens": 4617.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.66796875,
74
+ "epoch": 0.14736842105263157,
75
+ "grad_norm": 8.375,
76
+ "learning_rate": 4.6368421052631584e-05,
77
+ "loss": 1.6378042221069335,
78
+ "mean_token_accuracy": 0.7060423612594604,
79
+ "num_tokens": 5349.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.5109375,
84
+ "epoch": 0.16842105263157894,
85
+ "grad_norm": 6.03125,
86
+ "learning_rate": 4.58421052631579e-05,
87
+ "loss": 1.5179196357727052,
88
+ "mean_token_accuracy": 0.7291716754436492,
89
+ "num_tokens": 6122.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.721875,
94
+ "epoch": 0.18947368421052632,
95
+ "grad_norm": 4.0625,
96
+ "learning_rate": 4.531578947368421e-05,
97
+ "loss": 1.8176845550537108,
98
+ "mean_token_accuracy": 0.6842875778675079,
99
+ "num_tokens": 7197.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.52265625,
104
+ "epoch": 0.21052631578947367,
105
+ "grad_norm": 6.46875,
106
+ "learning_rate": 4.478947368421053e-05,
107
+ "loss": 1.5252375602722168,
108
+ "mean_token_accuracy": 0.711300152540207,
109
+ "num_tokens": 8046.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.6515625,
114
+ "epoch": 0.23157894736842105,
115
+ "grad_norm": 7.0,
116
+ "learning_rate": 4.426315789473684e-05,
117
+ "loss": 1.611851119995117,
118
+ "mean_token_accuracy": 0.7179319798946381,
119
+ "num_tokens": 8698.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.54140625,
124
+ "epoch": 0.25263157894736843,
125
+ "grad_norm": 6.90625,
126
+ "learning_rate": 4.373684210526316e-05,
127
+ "loss": 1.5053813934326172,
128
+ "mean_token_accuracy": 0.7282716870307923,
129
+ "num_tokens": 9455.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.5375,
134
+ "epoch": 0.2736842105263158,
135
+ "grad_norm": 7.25,
136
+ "learning_rate": 4.3210526315789475e-05,
137
+ "loss": 1.5904606819152831,
138
+ "mean_token_accuracy": 0.7234093546867371,
139
+ "num_tokens": 10191.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.58984375,
144
+ "epoch": 0.29473684210526313,
145
+ "grad_norm": 9.375,
146
+ "learning_rate": 4.2684210526315795e-05,
147
+ "loss": 1.6070585250854492,
148
+ "mean_token_accuracy": 0.7228378415107727,
149
+ "num_tokens": 10859.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.80234375,
154
+ "epoch": 0.3157894736842105,
155
+ "grad_norm": 8.3125,
156
+ "learning_rate": 4.215789473684211e-05,
157
+ "loss": 1.7423076629638672,
158
+ "mean_token_accuracy": 0.6761326909065246,
159
+ "num_tokens": 11846.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.592578125,
164
+ "epoch": 0.3368421052631579,
165
+ "grad_norm": 5.1875,
166
+ "learning_rate": 4.163157894736842e-05,
167
+ "loss": 1.6300687789916992,
168
+ "mean_token_accuracy": 0.7145272672176362,
169
+ "num_tokens": 12722.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.5953125,
174
+ "epoch": 0.35789473684210527,
175
+ "grad_norm": 6.125,
176
+ "learning_rate": 4.110526315789474e-05,
177
+ "loss": 1.44827880859375,
178
+ "mean_token_accuracy": 0.725699108839035,
179
+ "num_tokens": 13458.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.8171875,
184
+ "epoch": 0.37894736842105264,
185
+ "grad_norm": 7.125,
186
+ "learning_rate": 4.0578947368421054e-05,
187
+ "loss": 1.7392475128173828,
188
+ "mean_token_accuracy": 0.6949776589870453,
189
+ "num_tokens": 14197.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.78671875,
194
+ "epoch": 0.4,
195
+ "grad_norm": 7.8125,
196
+ "learning_rate": 4.0052631578947367e-05,
197
+ "loss": 1.8162834167480468,
198
+ "mean_token_accuracy": 0.6562099277973175,
199
+ "num_tokens": 15056.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.77421875,
204
+ "epoch": 0.42105263157894735,
205
+ "grad_norm": 7.53125,
206
+ "learning_rate": 3.9526315789473686e-05,
207
+ "loss": 1.7153417587280273,
208
+ "mean_token_accuracy": 0.6948422849178314,
209
+ "num_tokens": 15807.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.6875,
214
+ "epoch": 0.4421052631578947,
215
+ "grad_norm": 9.125,
216
+ "learning_rate": 3.9000000000000006e-05,
217
+ "loss": 1.6782726287841796,
218
+ "mean_token_accuracy": 0.7238920211791993,
219
+ "num_tokens": 16432.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.446484375,
224
+ "epoch": 0.4631578947368421,
225
+ "grad_norm": 5.84375,
226
+ "learning_rate": 3.847368421052632e-05,
227
+ "loss": 1.4408910751342774,
228
+ "mean_token_accuracy": 0.7403214454650879,
229
+ "num_tokens": 17103.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.76171875,
234
+ "epoch": 0.4842105263157895,
235
+ "grad_norm": 6.375,
236
+ "learning_rate": 3.794736842105263e-05,
237
+ "loss": 1.7578521728515626,
238
+ "mean_token_accuracy": 0.6848492741584777,
239
+ "num_tokens": 18038.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.53515625,
244
+ "epoch": 0.5052631578947369,
245
+ "grad_norm": 6.71875,
246
+ "learning_rate": 3.742105263157895e-05,
247
+ "loss": 1.48400821685791,
248
+ "mean_token_accuracy": 0.7131125509738923,
249
+ "num_tokens": 18720.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.49140625,
254
+ "epoch": 0.5263157894736842,
255
+ "grad_norm": 6.375,
256
+ "learning_rate": 3.6894736842105265e-05,
257
+ "loss": 1.5496106147766113,
258
+ "mean_token_accuracy": 0.7365909218788147,
259
+ "num_tokens": 19308.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.46796875,
264
+ "epoch": 0.5473684210526316,
265
+ "grad_norm": 8.125,
266
+ "learning_rate": 3.636842105263158e-05,
267
+ "loss": 1.5240853309631348,
268
+ "mean_token_accuracy": 0.736497437953949,
269
+ "num_tokens": 19967.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.578125,
274
+ "epoch": 0.5684210526315789,
275
+ "grad_norm": 8.3125,
276
+ "learning_rate": 3.58421052631579e-05,
277
+ "loss": 1.5680004119873048,
278
+ "mean_token_accuracy": 0.7332142323255539,
279
+ "num_tokens": 20769.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.501953125,
284
+ "epoch": 0.5894736842105263,
285
+ "grad_norm": 8.375,
286
+ "learning_rate": 3.531578947368421e-05,
287
+ "loss": 1.5029385566711426,
288
+ "mean_token_accuracy": 0.7254173457622528,
289
+ "num_tokens": 22503.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.5921875,
294
+ "epoch": 0.6105263157894737,
295
+ "grad_norm": 7.0,
296
+ "learning_rate": 3.478947368421053e-05,
297
+ "loss": 1.6090343475341797,
298
+ "mean_token_accuracy": 0.7082146763801574,
299
+ "num_tokens": 23439.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.75078125,
304
+ "epoch": 0.631578947368421,
305
+ "grad_norm": 8.25,
306
+ "learning_rate": 3.426315789473684e-05,
307
+ "loss": 1.6250024795532227,
308
+ "mean_token_accuracy": 0.6883616149425507,
309
+ "num_tokens": 24161.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.269140625,
314
+ "epoch": 0.6526315789473685,
315
+ "grad_norm": 7.46875,
316
+ "learning_rate": 3.373684210526316e-05,
317
+ "loss": 1.3148769378662108,
318
+ "mean_token_accuracy": 0.7618813216686249,
319
+ "num_tokens": 24716.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.809375,
324
+ "epoch": 0.6736842105263158,
325
+ "grad_norm": 7.40625,
326
+ "learning_rate": 3.3210526315789476e-05,
327
+ "loss": 1.9381757736206056,
328
+ "mean_token_accuracy": 0.667089307308197,
329
+ "num_tokens": 25481.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.73515625,
334
+ "epoch": 0.6947368421052632,
335
+ "grad_norm": 11.5625,
336
+ "learning_rate": 3.268421052631579e-05,
337
+ "loss": 1.76402587890625,
338
+ "mean_token_accuracy": 0.6992617845535278,
339
+ "num_tokens": 26159.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.644140625,
344
+ "epoch": 0.7157894736842105,
345
+ "grad_norm": 7.625,
346
+ "learning_rate": 3.215789473684211e-05,
347
+ "loss": 1.6910707473754882,
348
+ "mean_token_accuracy": 0.6974358975887298,
349
+ "num_tokens": 26983.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.44140625,
354
+ "epoch": 0.7368421052631579,
355
+ "grad_norm": 7.84375,
356
+ "learning_rate": 3.163157894736842e-05,
357
+ "loss": 1.3823105812072753,
358
+ "mean_token_accuracy": 0.7533604800701141,
359
+ "num_tokens": 27564.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.41953125,
364
+ "epoch": 0.7578947368421053,
365
+ "grad_norm": 9.5,
366
+ "learning_rate": 3.1105263157894735e-05,
367
+ "loss": 1.488726806640625,
368
+ "mean_token_accuracy": 0.733670562505722,
369
+ "num_tokens": 28200.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.753125,
374
+ "epoch": 0.7789473684210526,
375
+ "grad_norm": 5.9375,
376
+ "learning_rate": 3.0578947368421054e-05,
377
+ "loss": 1.8948373794555664,
378
+ "mean_token_accuracy": 0.675409197807312,
379
+ "num_tokens": 28919.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.98828125,
384
+ "epoch": 0.8,
385
+ "grad_norm": 8.5,
386
+ "learning_rate": 3.005263157894737e-05,
387
+ "loss": 2.146737289428711,
388
+ "mean_token_accuracy": 0.6334406793117523,
389
+ "num_tokens": 29894.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.85625,
394
+ "epoch": 0.8210526315789474,
395
+ "grad_norm": 6.3125,
396
+ "learning_rate": 2.9526315789473684e-05,
397
+ "loss": 1.858269500732422,
398
+ "mean_token_accuracy": 0.6439402669668197,
399
+ "num_tokens": 30652.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.727734375,
404
+ "epoch": 0.8421052631578947,
405
+ "grad_norm": 7.96875,
406
+ "learning_rate": 2.9e-05,
407
+ "loss": 1.7343599319458007,
408
+ "mean_token_accuracy": 0.700012594461441,
409
+ "num_tokens": 31372.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.65,
414
+ "epoch": 0.8631578947368421,
415
+ "grad_norm": 8.0625,
416
+ "learning_rate": 2.847368421052632e-05,
417
+ "loss": 1.6553001403808594,
418
+ "mean_token_accuracy": 0.7165175020694733,
419
+ "num_tokens": 32149.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.596875,
424
+ "epoch": 0.8842105263157894,
425
+ "grad_norm": 6.59375,
426
+ "learning_rate": 2.7947368421052633e-05,
427
+ "loss": 1.5661128997802733,
428
+ "mean_token_accuracy": 0.7166097521781921,
429
+ "num_tokens": 33203.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.6015625,
434
+ "epoch": 0.9052631578947369,
435
+ "grad_norm": 7.1875,
436
+ "learning_rate": 2.7421052631578946e-05,
437
+ "loss": 1.6179162979125976,
438
+ "mean_token_accuracy": 0.7093785464763641,
439
+ "num_tokens": 33889.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.75546875,
444
+ "epoch": 0.9263157894736842,
445
+ "grad_norm": 7.21875,
446
+ "learning_rate": 2.6894736842105266e-05,
447
+ "loss": 1.7675729751586915,
448
+ "mean_token_accuracy": 0.6769322335720063,
449
+ "num_tokens": 34862.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.92109375,
454
+ "epoch": 0.9473684210526315,
455
+ "grad_norm": 11.0625,
456
+ "learning_rate": 2.6368421052631582e-05,
457
+ "loss": 1.9548963546752929,
458
+ "mean_token_accuracy": 0.648491358757019,
459
+ "num_tokens": 35710.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.699609375,
464
+ "epoch": 0.968421052631579,
465
+ "grad_norm": 6.03125,
466
+ "learning_rate": 2.5842105263157895e-05,
467
+ "loss": 1.6432403564453124,
468
+ "mean_token_accuracy": 0.6918732106685639,
469
+ "num_tokens": 36374.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.84453125,
474
+ "epoch": 0.9894736842105263,
475
+ "grad_norm": 7.03125,
476
+ "learning_rate": 2.5315789473684208e-05,
477
+ "loss": 1.7591934204101562,
478
+ "mean_token_accuracy": 0.6581568241119384,
479
+ "num_tokens": 37139.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.75859375,
484
+ "epoch": 1.0105263157894737,
485
+ "grad_norm": 7.15625,
486
+ "learning_rate": 2.4789473684210528e-05,
487
+ "loss": 1.7067642211914062,
488
+ "mean_token_accuracy": 0.7008034646511078,
489
+ "num_tokens": 38037.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.4203125,
494
+ "epoch": 1.0315789473684212,
495
+ "grad_norm": 6.4375,
496
+ "learning_rate": 2.4263157894736844e-05,
497
+ "loss": 1.378176498413086,
498
+ "mean_token_accuracy": 0.7486959993839264,
499
+ "num_tokens": 38668.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.56328125,
504
+ "epoch": 1.0526315789473684,
505
+ "grad_norm": 7.15625,
506
+ "learning_rate": 2.373684210526316e-05,
507
+ "loss": 1.4762983322143555,
508
+ "mean_token_accuracy": 0.7175065577030182,
509
+ "num_tokens": 39424.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.630078125,
514
+ "epoch": 1.0736842105263158,
515
+ "grad_norm": 4.84375,
516
+ "learning_rate": 2.3210526315789473e-05,
517
+ "loss": 1.5906378746032714,
518
+ "mean_token_accuracy": 0.6876044690608978,
519
+ "num_tokens": 40317.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.556640625,
524
+ "epoch": 1.0947368421052632,
525
+ "grad_norm": 6.59375,
526
+ "learning_rate": 2.268421052631579e-05,
527
+ "loss": 1.6246864318847656,
528
+ "mean_token_accuracy": 0.7021546125411987,
529
+ "num_tokens": 41101.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.86015625,
534
+ "epoch": 1.1157894736842104,
535
+ "grad_norm": 6.3125,
536
+ "learning_rate": 2.2157894736842106e-05,
537
+ "loss": 1.832110595703125,
538
+ "mean_token_accuracy": 0.6659113824367523,
539
+ "num_tokens": 42080.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.50859375,
544
+ "epoch": 1.1368421052631579,
545
+ "grad_norm": 4.75,
546
+ "learning_rate": 2.1631578947368423e-05,
547
+ "loss": 1.495261001586914,
548
+ "mean_token_accuracy": 0.7114485323429107,
549
+ "num_tokens": 43064.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.379296875,
554
+ "epoch": 1.1578947368421053,
555
+ "grad_norm": 7.90625,
556
+ "learning_rate": 2.110526315789474e-05,
557
+ "loss": 1.3109845161437987,
558
+ "mean_token_accuracy": 0.7768619418144226,
559
+ "num_tokens": 43638.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.671875,
564
+ "epoch": 1.1789473684210527,
565
+ "grad_norm": 8.1875,
566
+ "learning_rate": 2.0578947368421052e-05,
567
+ "loss": 1.7390716552734375,
568
+ "mean_token_accuracy": 0.6966245353221894,
569
+ "num_tokens": 44538.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.45390625,
574
+ "epoch": 1.2,
575
+ "grad_norm": 8.5,
576
+ "learning_rate": 2.005263157894737e-05,
577
+ "loss": 1.3690235137939453,
578
+ "mean_token_accuracy": 0.742480456829071,
579
+ "num_tokens": 45159.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.8234375,
584
+ "epoch": 1.2210526315789474,
585
+ "grad_norm": 6.65625,
586
+ "learning_rate": 1.9526315789473685e-05,
587
+ "loss": 1.890069580078125,
588
+ "mean_token_accuracy": 0.6900433540344239,
589
+ "num_tokens": 45918.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.8640625,
594
+ "epoch": 1.2421052631578948,
595
+ "grad_norm": 8.125,
596
+ "learning_rate": 1.9e-05,
597
+ "loss": 1.7881799697875977,
598
+ "mean_token_accuracy": 0.6537608683109284,
599
+ "num_tokens": 46795.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.59453125,
604
+ "epoch": 1.263157894736842,
605
+ "grad_norm": 6.5,
606
+ "learning_rate": 1.8473684210526317e-05,
607
+ "loss": 1.5872214317321778,
608
+ "mean_token_accuracy": 0.722487497329712,
609
+ "num_tokens": 47423.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.78828125,
614
+ "epoch": 1.2842105263157895,
615
+ "grad_norm": 7.3125,
616
+ "learning_rate": 1.7947368421052634e-05,
617
+ "loss": 1.6725593566894532,
618
+ "mean_token_accuracy": 0.6980259001255036,
619
+ "num_tokens": 48158.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.61015625,
624
+ "epoch": 1.305263157894737,
625
+ "grad_norm": 6.5,
626
+ "learning_rate": 1.742105263157895e-05,
627
+ "loss": 1.5390226364135742,
628
+ "mean_token_accuracy": 0.725999391078949,
629
+ "num_tokens": 48875.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.71953125,
634
+ "epoch": 1.3263157894736843,
635
+ "grad_norm": 8.25,
636
+ "learning_rate": 1.6894736842105263e-05,
637
+ "loss": 1.6228282928466797,
638
+ "mean_token_accuracy": 0.7085169553756714,
639
+ "num_tokens": 49697.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.437109375,
644
+ "epoch": 1.3473684210526315,
645
+ "grad_norm": 7.5625,
646
+ "learning_rate": 1.636842105263158e-05,
647
+ "loss": 1.4906560897827148,
648
+ "mean_token_accuracy": 0.7471937596797943,
649
+ "num_tokens": 50300.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.73359375,
654
+ "epoch": 1.368421052631579,
655
+ "grad_norm": 4.375,
656
+ "learning_rate": 1.5842105263157896e-05,
657
+ "loss": 1.5986823081970214,
658
+ "mean_token_accuracy": 0.699000483751297,
659
+ "num_tokens": 51332.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.2671875,
664
+ "epoch": 1.3894736842105262,
665
+ "grad_norm": 11.8125,
666
+ "learning_rate": 1.5315789473684212e-05,
667
+ "loss": 1.234378719329834,
668
+ "mean_token_accuracy": 0.7714344441890717,
669
+ "num_tokens": 51900.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.90859375,
674
+ "epoch": 1.4105263157894736,
675
+ "grad_norm": 6.875,
676
+ "learning_rate": 1.4789473684210529e-05,
677
+ "loss": 1.815606689453125,
678
+ "mean_token_accuracy": 0.6656625211238861,
679
+ "num_tokens": 52883.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.558203125,
684
+ "epoch": 1.431578947368421,
685
+ "grad_norm": 9.375,
686
+ "learning_rate": 1.4263157894736842e-05,
687
+ "loss": 1.4858011245727538,
688
+ "mean_token_accuracy": 0.7443289816379547,
689
+ "num_tokens": 53576.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.64765625,
694
+ "epoch": 1.4526315789473685,
695
+ "grad_norm": 8.9375,
696
+ "learning_rate": 1.373684210526316e-05,
697
+ "loss": 1.534531307220459,
698
+ "mean_token_accuracy": 0.7204049170017243,
699
+ "num_tokens": 54450.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.58359375,
704
+ "epoch": 1.4736842105263157,
705
+ "grad_norm": 6.46875,
706
+ "learning_rate": 1.3210526315789473e-05,
707
+ "loss": 1.5365165710449218,
708
+ "mean_token_accuracy": 0.7069519102573395,
709
+ "num_tokens": 55160.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.4171875,
714
+ "epoch": 1.4947368421052631,
715
+ "grad_norm": 7.1875,
716
+ "learning_rate": 1.268421052631579e-05,
717
+ "loss": 1.307802391052246,
718
+ "mean_token_accuracy": 0.7627157270908356,
719
+ "num_tokens": 55929.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.678125,
724
+ "epoch": 1.5157894736842106,
725
+ "grad_norm": 6.34375,
726
+ "learning_rate": 1.2157894736842105e-05,
727
+ "loss": 1.5420659065246582,
728
+ "mean_token_accuracy": 0.7097006201744079,
729
+ "num_tokens": 56725.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.58125,
734
+ "epoch": 1.5368421052631578,
735
+ "grad_norm": 7.0625,
736
+ "learning_rate": 1.1631578947368422e-05,
737
+ "loss": 1.4044910430908204,
738
+ "mean_token_accuracy": 0.7465328335762024,
739
+ "num_tokens": 57398.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.576171875,
744
+ "epoch": 1.5578947368421052,
745
+ "grad_norm": 6.03125,
746
+ "learning_rate": 1.1105263157894738e-05,
747
+ "loss": 1.605686569213867,
748
+ "mean_token_accuracy": 0.7333506286144257,
749
+ "num_tokens": 58072.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 1.59140625,
754
+ "epoch": 1.5789473684210527,
755
+ "grad_norm": 6.03125,
756
+ "learning_rate": 1.0578947368421053e-05,
757
+ "loss": 1.4928099632263183,
758
+ "mean_token_accuracy": 0.7072650909423828,
759
+ "num_tokens": 58748.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.63046875,
764
+ "epoch": 1.6,
765
+ "grad_norm": 7.1875,
766
+ "learning_rate": 1.005263157894737e-05,
767
+ "loss": 1.624325942993164,
768
+ "mean_token_accuracy": 0.690889635682106,
769
+ "num_tokens": 59658.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.803125,
774
+ "epoch": 1.6210526315789475,
775
+ "grad_norm": 6.03125,
776
+ "learning_rate": 9.526315789473684e-06,
777
+ "loss": 1.7750425338745117,
778
+ "mean_token_accuracy": 0.6651369571685791,
779
+ "num_tokens": 60859.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.4703125,
784
+ "epoch": 1.6421052631578947,
785
+ "grad_norm": 7.03125,
786
+ "learning_rate": 9e-06,
787
+ "loss": 1.4258437156677246,
788
+ "mean_token_accuracy": 0.7347829401493072,
789
+ "num_tokens": 62439.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.671875,
794
+ "epoch": 1.663157894736842,
795
+ "grad_norm": 7.71875,
796
+ "learning_rate": 8.473684210526315e-06,
797
+ "loss": 1.721219825744629,
798
+ "mean_token_accuracy": 0.7035934925079346,
799
+ "num_tokens": 63290.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.554296875,
804
+ "epoch": 1.6842105263157894,
805
+ "grad_norm": 9.0625,
806
+ "learning_rate": 7.947368421052633e-06,
807
+ "loss": 1.5237810134887695,
808
+ "mean_token_accuracy": 0.7325670003890992,
809
+ "num_tokens": 64010.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.7859375,
814
+ "epoch": 1.7052631578947368,
815
+ "grad_norm": 8.5,
816
+ "learning_rate": 7.421052631578948e-06,
817
+ "loss": 1.7944671630859375,
818
+ "mean_token_accuracy": 0.683906614780426,
819
+ "num_tokens": 64966.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.840625,
824
+ "epoch": 1.7263157894736842,
825
+ "grad_norm": 7.53125,
826
+ "learning_rate": 6.894736842105263e-06,
827
+ "loss": 1.787227249145508,
828
+ "mean_token_accuracy": 0.6670031368732452,
829
+ "num_tokens": 65703.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.29765625,
834
+ "epoch": 1.7473684210526317,
835
+ "grad_norm": 9.0625,
836
+ "learning_rate": 6.368421052631579e-06,
837
+ "loss": 1.2813177108764648,
838
+ "mean_token_accuracy": 0.7610228896141052,
839
+ "num_tokens": 66421.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.504296875,
844
+ "epoch": 1.768421052631579,
845
+ "grad_norm": 6.5,
846
+ "learning_rate": 5.842105263157895e-06,
847
+ "loss": 1.477138137817383,
848
+ "mean_token_accuracy": 0.7455608665943145,
849
+ "num_tokens": 67005.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.60859375,
854
+ "epoch": 1.7894736842105263,
855
+ "grad_norm": 5.03125,
856
+ "learning_rate": 5.315789473684211e-06,
857
+ "loss": 1.6519662857055664,
858
+ "mean_token_accuracy": 0.7005816400051117,
859
+ "num_tokens": 67975.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.6765625,
864
+ "epoch": 1.8105263157894735,
865
+ "grad_norm": 6.53125,
866
+ "learning_rate": 4.789473684210526e-06,
867
+ "loss": 1.6701608657836915,
868
+ "mean_token_accuracy": 0.7136963486671448,
869
+ "num_tokens": 68650.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 1.44609375,
874
+ "epoch": 1.831578947368421,
875
+ "grad_norm": 7.1875,
876
+ "learning_rate": 4.2631578947368425e-06,
877
+ "loss": 1.3475713729858398,
878
+ "mean_token_accuracy": 0.7566476047039032,
879
+ "num_tokens": 69317.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.71328125,
884
+ "epoch": 1.8526315789473684,
885
+ "grad_norm": 6.90625,
886
+ "learning_rate": 3.736842105263158e-06,
887
+ "loss": 1.7077770233154297,
888
+ "mean_token_accuracy": 0.6919207274913788,
889
+ "num_tokens": 70169.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.646875,
894
+ "epoch": 1.8736842105263158,
895
+ "grad_norm": 7.8125,
896
+ "learning_rate": 3.2105263157894735e-06,
897
+ "loss": 1.5933343887329101,
898
+ "mean_token_accuracy": 0.708821702003479,
899
+ "num_tokens": 71138.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.437109375,
904
+ "epoch": 1.8947368421052633,
905
+ "grad_norm": 8.625,
906
+ "learning_rate": 2.68421052631579e-06,
907
+ "loss": 1.4426955223083495,
908
+ "mean_token_accuracy": 0.7344056785106658,
909
+ "num_tokens": 71776.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.591015625,
914
+ "epoch": 1.9157894736842105,
915
+ "grad_norm": 8.125,
916
+ "learning_rate": 2.1578947368421054e-06,
917
+ "loss": 1.4523811340332031,
918
+ "mean_token_accuracy": 0.7294364452362061,
919
+ "num_tokens": 72390.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.48984375,
924
+ "epoch": 1.936842105263158,
925
+ "grad_norm": 8.375,
926
+ "learning_rate": 1.6315789473684212e-06,
927
+ "loss": 1.4202921867370606,
928
+ "mean_token_accuracy": 0.7283547043800354,
929
+ "num_tokens": 73109.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.37578125,
934
+ "epoch": 1.9578947368421051,
935
+ "grad_norm": 7.34375,
936
+ "learning_rate": 1.1052631578947369e-06,
937
+ "loss": 1.3212156295776367,
938
+ "mean_token_accuracy": 0.765373581647873,
939
+ "num_tokens": 73665.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.69140625,
944
+ "epoch": 1.9789473684210526,
945
+ "grad_norm": 7.5625,
946
+ "learning_rate": 5.789473684210527e-07,
947
+ "loss": 1.5747876167297363,
948
+ "mean_token_accuracy": 0.7353746354579925,
949
+ "num_tokens": 74320.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.75390625,
954
+ "epoch": 2.0,
955
+ "grad_norm": 6.46875,
956
+ "learning_rate": 5.263157894736842e-08,
957
+ "loss": 1.7055414199829102,
958
+ "mean_token_accuracy": 0.6862038552761078,
959
+ "num_tokens": 75304.0,
960
+ "step": 950
961
+ }
962
+ ],
963
+ "logging_steps": 10,
964
+ "max_steps": 950,
965
+ "num_input_tokens_seen": 0,
966
+ "num_train_epochs": 2,
967
+ "save_steps": 100,
968
+ "stateful_callbacks": {
969
+ "TrainerControl": {
970
+ "args": {
971
+ "should_epoch_stop": false,
972
+ "should_evaluate": false,
973
+ "should_log": false,
974
+ "should_save": true,
975
+ "should_training_stop": true
976
+ },
977
+ "attributes": {}
978
+ }
979
+ },
980
+ "total_flos": 207562104419328.0,
981
+ "train_batch_size": 1,
982
+ "trial_name": null,
983
+ "trial_params": null
984
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81fe99ed782ca55dead5a35dfca3b7fd4020c233b42b2db4b7d03721505224d1
3
+ size 5585