Jerry999 commited on
Commit
8b68ee6
·
verified ·
1 Parent(s): 7280ef9

Upload checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308

Browse files
Files changed (30) hide show
  1. .gitattributes +2 -0
  2. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/chat_template.jinja +4 -0
  3. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/config.json +71 -0
  4. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/generation_config.json +12 -0
  5. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/model.safetensors +3 -0
  6. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/optimizer.pt +3 -0
  7. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/rng_state.pth +3 -0
  8. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/scheduler.pt +3 -0
  9. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer.json +3 -0
  10. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer_config.json +29 -0
  11. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokens_state. +1 -0
  12. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/trainer_state.json +2994 -0
  13. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/training_args.bin +3 -0
  14. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/chat_template.jinja +4 -0
  15. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/config.json +71 -0
  16. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/generation_config.json +12 -0
  17. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/model.safetensors +3 -0
  18. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/optimizer.pt +3 -0
  19. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/rng_state.pth +3 -0
  20. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/scheduler.pt +3 -0
  21. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer.json +3 -0
  22. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer_config.json +29 -0
  23. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokens_state. +1 -0
  24. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/trainer_state.json +0 -0
  25. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/training_args.bin +3 -0
  26. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/debug.log +0 -0
  27. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl +0 -0
  28. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_results.jsonl +0 -0
  29. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_results.csv +2 -0
  30. checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_summary.json +133 -0
.gitattributes CHANGED
@@ -70,3 +70,5 @@ checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokeni
70
  checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-3090/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
  checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-4120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
  checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-5150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
70
  checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-3090/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
  checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-4120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
  checkpoints/math_operations/lora_sft_primitive_atomic_50k_t20260305/checkpoint-5150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
73
+ checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer.json filter=lfs diff=lfs merge=lfs -text
74
+ checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 262144,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 5000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": true,
67
+ "transformers_version": "5.0.0",
68
+ "use_cache": false,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.7,
9
+ "top_k": 20,
10
+ "top_p": 0.8,
11
+ "transformers_version": "5.0.0"
12
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41548483ee88f462265e30988e470cae80e429f667488e27290ea8dcd96c7df8
3
+ size 8822894520
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b120fe9ab7d83a8f3aa901047393a5faab6a2e2a98a720c552f360e6688766b
3
+ size 16090225449
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea11996454b5587fcf33ae0ab5cf14b2031bf5f53f8c2ed5a48e87de31e29c84
3
+ size 14645
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c33112fd93bfc97f8f9bcbedcd3ae38bbd63fe54948a8b1440778efd51de260
3
+ size 1465
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/tokens_state. ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total": 34121728, "trainable": 10689603}
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/trainer_state.json ADDED
@@ -0,0 +1,2994 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1563,
3
+ "best_metric": 0.0007253550575114787,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9992800575953924,
6
+ "eval_steps": 521,
7
+ "global_step": 2082,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0,
14
+ "eval_loss": 0.891994059085846,
15
+ "eval_ppl": 2.43999,
16
+ "eval_runtime": 17.3569,
17
+ "eval_samples_per_second": 11.523,
18
+ "eval_steps_per_second": 11.523,
19
+ "memory/device_reserved (GiB)": 10.64,
20
+ "memory/max_active (GiB)": 10.41,
21
+ "memory/max_allocated (GiB)": 10.41,
22
+ "step": 0
23
+ },
24
+ {
25
+ "epoch": 0.009599232061435085,
26
+ "grad_norm": 19.625,
27
+ "learning_rate": 2.884615384615385e-06,
28
+ "loss": 0.8230592727661132,
29
+ "memory/device_reserved (GiB)": 36.5,
30
+ "memory/max_active (GiB)": 33.97,
31
+ "memory/max_allocated (GiB)": 33.97,
32
+ "ppl": 2.27746,
33
+ "step": 10,
34
+ "tokens/total": 163840,
35
+ "tokens/train_per_sec_per_gpu": 8.53,
36
+ "tokens/trainable": 51937
37
+ },
38
+ {
39
+ "epoch": 0.01919846412287017,
40
+ "grad_norm": 6.0625,
41
+ "learning_rate": 6.08974358974359e-06,
42
+ "loss": 0.4511241436004639,
43
+ "memory/device_reserved (GiB)": 36.5,
44
+ "memory/max_active (GiB)": 33.97,
45
+ "memory/max_allocated (GiB)": 33.97,
46
+ "ppl": 1.57008,
47
+ "step": 20,
48
+ "tokens/total": 327680,
49
+ "tokens/train_per_sec_per_gpu": 8.58,
50
+ "tokens/trainable": 103351
51
+ },
52
+ {
53
+ "epoch": 0.028797696184305256,
54
+ "grad_norm": 3.796875,
55
+ "learning_rate": 9.294871794871795e-06,
56
+ "loss": 0.13991469144821167,
57
+ "memory/device_reserved (GiB)": 36.5,
58
+ "memory/max_active (GiB)": 33.97,
59
+ "memory/max_allocated (GiB)": 33.97,
60
+ "ppl": 1.15018,
61
+ "step": 30,
62
+ "tokens/total": 491520,
63
+ "tokens/train_per_sec_per_gpu": 8.33,
64
+ "tokens/trainable": 154482
65
+ },
66
+ {
67
+ "epoch": 0.03839692824574034,
68
+ "grad_norm": 1.6171875,
69
+ "learning_rate": 1.25e-05,
70
+ "loss": 0.017821089923381807,
71
+ "memory/device_reserved (GiB)": 36.5,
72
+ "memory/max_active (GiB)": 33.97,
73
+ "memory/max_allocated (GiB)": 33.97,
74
+ "ppl": 1.01798,
75
+ "step": 40,
76
+ "tokens/total": 655360,
77
+ "tokens/train_per_sec_per_gpu": 7.78,
78
+ "tokens/trainable": 205247
79
+ },
80
+ {
81
+ "epoch": 0.04799616030717543,
82
+ "grad_norm": 1.0546875,
83
+ "learning_rate": 1.5705128205128205e-05,
84
+ "loss": 0.005615117400884629,
85
+ "memory/device_reserved (GiB)": 36.5,
86
+ "memory/max_active (GiB)": 33.97,
87
+ "memory/max_allocated (GiB)": 33.97,
88
+ "ppl": 1.00563,
89
+ "step": 50,
90
+ "tokens/total": 819200,
91
+ "tokens/train_per_sec_per_gpu": 9.73,
92
+ "tokens/trainable": 257032
93
+ },
94
+ {
95
+ "epoch": 0.05759539236861051,
96
+ "grad_norm": 0.72265625,
97
+ "learning_rate": 1.891025641025641e-05,
98
+ "loss": 0.003246866911649704,
99
+ "memory/device_reserved (GiB)": 36.5,
100
+ "memory/max_active (GiB)": 33.97,
101
+ "memory/max_allocated (GiB)": 33.97,
102
+ "ppl": 1.00325,
103
+ "step": 60,
104
+ "tokens/total": 983040,
105
+ "tokens/train_per_sec_per_gpu": 7.76,
106
+ "tokens/trainable": 309407
107
+ },
108
+ {
109
+ "epoch": 0.06719462443004559,
110
+ "grad_norm": 0.609375,
111
+ "learning_rate": 2.2115384615384616e-05,
112
+ "loss": 0.002711128443479538,
113
+ "memory/device_reserved (GiB)": 36.5,
114
+ "memory/max_active (GiB)": 33.97,
115
+ "memory/max_allocated (GiB)": 33.97,
116
+ "ppl": 1.00271,
117
+ "step": 70,
118
+ "tokens/total": 1146880,
119
+ "tokens/train_per_sec_per_gpu": 7.92,
120
+ "tokens/trainable": 361385
121
+ },
122
+ {
123
+ "epoch": 0.07679385649148068,
124
+ "grad_norm": 0.359375,
125
+ "learning_rate": 2.5320512820512822e-05,
126
+ "loss": 0.00267685167491436,
127
+ "memory/device_reserved (GiB)": 36.5,
128
+ "memory/max_active (GiB)": 33.97,
129
+ "memory/max_allocated (GiB)": 33.97,
130
+ "ppl": 1.00268,
131
+ "step": 80,
132
+ "tokens/total": 1310720,
133
+ "tokens/train_per_sec_per_gpu": 7.9,
134
+ "tokens/trainable": 413011
135
+ },
136
+ {
137
+ "epoch": 0.08639308855291576,
138
+ "grad_norm": 1.3359375,
139
+ "learning_rate": 2.8525641025641025e-05,
140
+ "loss": 0.002553700841963291,
141
+ "memory/device_reserved (GiB)": 36.5,
142
+ "memory/max_active (GiB)": 33.97,
143
+ "memory/max_allocated (GiB)": 33.97,
144
+ "ppl": 1.00256,
145
+ "step": 90,
146
+ "tokens/total": 1474560,
147
+ "tokens/train_per_sec_per_gpu": 7.84,
148
+ "tokens/trainable": 464413
149
+ },
150
+ {
151
+ "epoch": 0.09599232061435085,
152
+ "grad_norm": 0.1640625,
153
+ "learning_rate": 3.1730769230769234e-05,
154
+ "loss": 0.003567858040332794,
155
+ "memory/device_reserved (GiB)": 36.5,
156
+ "memory/max_active (GiB)": 33.97,
157
+ "memory/max_allocated (GiB)": 33.97,
158
+ "ppl": 1.00357,
159
+ "step": 100,
160
+ "tokens/total": 1638400,
161
+ "tokens/train_per_sec_per_gpu": 7.55,
162
+ "tokens/trainable": 516315
163
+ },
164
+ {
165
+ "epoch": 0.10559155267578593,
166
+ "grad_norm": 0.7734375,
167
+ "learning_rate": 3.4935897435897436e-05,
168
+ "loss": 0.002936176210641861,
169
+ "memory/device_reserved (GiB)": 36.5,
170
+ "memory/max_active (GiB)": 33.97,
171
+ "memory/max_allocated (GiB)": 33.97,
172
+ "ppl": 1.00294,
173
+ "step": 110,
174
+ "tokens/total": 1802240,
175
+ "tokens/train_per_sec_per_gpu": 8.34,
176
+ "tokens/trainable": 568044
177
+ },
178
+ {
179
+ "epoch": 0.11519078473722102,
180
+ "grad_norm": 0.333984375,
181
+ "learning_rate": 3.814102564102564e-05,
182
+ "loss": 0.0027721570804715157,
183
+ "memory/device_reserved (GiB)": 36.5,
184
+ "memory/max_active (GiB)": 33.97,
185
+ "memory/max_allocated (GiB)": 33.97,
186
+ "ppl": 1.00278,
187
+ "step": 120,
188
+ "tokens/total": 1966080,
189
+ "tokens/train_per_sec_per_gpu": 9.06,
190
+ "tokens/trainable": 620018
191
+ },
192
+ {
193
+ "epoch": 0.1247900167986561,
194
+ "grad_norm": 0.2021484375,
195
+ "learning_rate": 4.134615384615385e-05,
196
+ "loss": 0.0024721408262848854,
197
+ "memory/device_reserved (GiB)": 36.5,
198
+ "memory/max_active (GiB)": 33.97,
199
+ "memory/max_allocated (GiB)": 33.97,
200
+ "ppl": 1.00248,
201
+ "step": 130,
202
+ "tokens/total": 2129920,
203
+ "tokens/train_per_sec_per_gpu": 7.08,
204
+ "tokens/trainable": 670706
205
+ },
206
+ {
207
+ "epoch": 0.13438924886009118,
208
+ "grad_norm": 0.337890625,
209
+ "learning_rate": 4.455128205128206e-05,
210
+ "loss": 0.003622889146208763,
211
+ "memory/device_reserved (GiB)": 36.5,
212
+ "memory/max_active (GiB)": 33.97,
213
+ "memory/max_allocated (GiB)": 33.97,
214
+ "ppl": 1.00363,
215
+ "step": 140,
216
+ "tokens/total": 2293760,
217
+ "tokens/train_per_sec_per_gpu": 8.19,
218
+ "tokens/trainable": 722137
219
+ },
220
+ {
221
+ "epoch": 0.14398848092152627,
222
+ "grad_norm": 0.298828125,
223
+ "learning_rate": 4.775641025641026e-05,
224
+ "loss": 0.002823374792933464,
225
+ "memory/device_reserved (GiB)": 36.5,
226
+ "memory/max_active (GiB)": 33.97,
227
+ "memory/max_allocated (GiB)": 33.97,
228
+ "ppl": 1.00283,
229
+ "step": 150,
230
+ "tokens/total": 2457600,
231
+ "tokens/train_per_sec_per_gpu": 7.49,
232
+ "tokens/trainable": 773351
233
+ },
234
+ {
235
+ "epoch": 0.15358771298296137,
236
+ "grad_norm": 0.2255859375,
237
+ "learning_rate": 5.096153846153846e-05,
238
+ "loss": 0.00175777580589056,
239
+ "memory/device_reserved (GiB)": 36.5,
240
+ "memory/max_active (GiB)": 33.97,
241
+ "memory/max_allocated (GiB)": 33.97,
242
+ "ppl": 1.00176,
243
+ "step": 160,
244
+ "tokens/total": 2621440,
245
+ "tokens/train_per_sec_per_gpu": 9.77,
246
+ "tokens/trainable": 824611
247
+ },
248
+ {
249
+ "epoch": 0.16318694504439646,
250
+ "grad_norm": 0.216796875,
251
+ "learning_rate": 5.4166666666666664e-05,
252
+ "loss": 0.0025411507114768027,
253
+ "memory/device_reserved (GiB)": 36.5,
254
+ "memory/max_active (GiB)": 33.97,
255
+ "memory/max_allocated (GiB)": 33.97,
256
+ "ppl": 1.00254,
257
+ "step": 170,
258
+ "tokens/total": 2785280,
259
+ "tokens/train_per_sec_per_gpu": 8.71,
260
+ "tokens/trainable": 875746
261
+ },
262
+ {
263
+ "epoch": 0.17278617710583152,
264
+ "grad_norm": 0.310546875,
265
+ "learning_rate": 5.737179487179487e-05,
266
+ "loss": 0.0037163086235523224,
267
+ "memory/device_reserved (GiB)": 36.5,
268
+ "memory/max_active (GiB)": 33.97,
269
+ "memory/max_allocated (GiB)": 33.97,
270
+ "ppl": 1.00372,
271
+ "step": 180,
272
+ "tokens/total": 2949120,
273
+ "tokens/train_per_sec_per_gpu": 7.88,
274
+ "tokens/trainable": 927000
275
+ },
276
+ {
277
+ "epoch": 0.18238540916726662,
278
+ "grad_norm": 0.3125,
279
+ "learning_rate": 6.0576923076923076e-05,
280
+ "loss": 0.0028080834075808526,
281
+ "memory/device_reserved (GiB)": 36.5,
282
+ "memory/max_active (GiB)": 33.97,
283
+ "memory/max_allocated (GiB)": 33.97,
284
+ "ppl": 1.00281,
285
+ "step": 190,
286
+ "tokens/total": 3112960,
287
+ "tokens/train_per_sec_per_gpu": 9.06,
288
+ "tokens/trainable": 978689
289
+ },
290
+ {
291
+ "epoch": 0.1919846412287017,
292
+ "grad_norm": 2.765625,
293
+ "learning_rate": 6.378205128205128e-05,
294
+ "loss": 0.05687007904052734,
295
+ "memory/device_reserved (GiB)": 36.5,
296
+ "memory/max_active (GiB)": 33.97,
297
+ "memory/max_allocated (GiB)": 33.97,
298
+ "ppl": 1.05852,
299
+ "step": 200,
300
+ "tokens/total": 3276800,
301
+ "tokens/train_per_sec_per_gpu": 9.12,
302
+ "tokens/trainable": 1029912
303
+ },
304
+ {
305
+ "epoch": 0.2015838732901368,
306
+ "grad_norm": 2.046875,
307
+ "learning_rate": 6.698717948717949e-05,
308
+ "loss": 0.021659491956233977,
309
+ "memory/device_reserved (GiB)": 36.5,
310
+ "memory/max_active (GiB)": 33.97,
311
+ "memory/max_allocated (GiB)": 33.97,
312
+ "ppl": 1.0219,
313
+ "step": 210,
314
+ "tokens/total": 3440640,
315
+ "tokens/train_per_sec_per_gpu": 8.6,
316
+ "tokens/trainable": 1081023
317
+ },
318
+ {
319
+ "epoch": 0.21118310535157186,
320
+ "grad_norm": 2.21875,
321
+ "learning_rate": 7.019230769230769e-05,
322
+ "loss": 0.012803517282009125,
323
+ "memory/device_reserved (GiB)": 36.5,
324
+ "memory/max_active (GiB)": 33.97,
325
+ "memory/max_allocated (GiB)": 33.97,
326
+ "ppl": 1.01289,
327
+ "step": 220,
328
+ "tokens/total": 3604480,
329
+ "tokens/train_per_sec_per_gpu": 9.27,
330
+ "tokens/trainable": 1132719
331
+ },
332
+ {
333
+ "epoch": 0.22078233741300696,
334
+ "grad_norm": 0.53125,
335
+ "learning_rate": 7.339743589743589e-05,
336
+ "loss": 0.012191119790077209,
337
+ "memory/device_reserved (GiB)": 36.5,
338
+ "memory/max_active (GiB)": 33.97,
339
+ "memory/max_allocated (GiB)": 33.97,
340
+ "ppl": 1.01227,
341
+ "step": 230,
342
+ "tokens/total": 3768320,
343
+ "tokens/train_per_sec_per_gpu": 8.39,
344
+ "tokens/trainable": 1184217
345
+ },
346
+ {
347
+ "epoch": 0.23038156947444205,
348
+ "grad_norm": 4.0625,
349
+ "learning_rate": 7.660256410256411e-05,
350
+ "loss": 0.011262766271829604,
351
+ "memory/device_reserved (GiB)": 36.5,
352
+ "memory/max_active (GiB)": 33.97,
353
+ "memory/max_allocated (GiB)": 33.97,
354
+ "ppl": 1.01133,
355
+ "step": 240,
356
+ "tokens/total": 3932160,
357
+ "tokens/train_per_sec_per_gpu": 7.51,
358
+ "tokens/trainable": 1235669
359
+ },
360
+ {
361
+ "epoch": 0.23998080153587714,
362
+ "grad_norm": 22.125,
363
+ "learning_rate": 7.980769230769231e-05,
364
+ "loss": 0.015024600923061371,
365
+ "memory/device_reserved (GiB)": 36.5,
366
+ "memory/max_active (GiB)": 33.97,
367
+ "memory/max_allocated (GiB)": 33.97,
368
+ "ppl": 1.01514,
369
+ "step": 250,
370
+ "tokens/total": 4096000,
371
+ "tokens/train_per_sec_per_gpu": 8.82,
372
+ "tokens/trainable": 1287445
373
+ },
374
+ {
375
+ "epoch": 0.2495800335973122,
376
+ "grad_norm": 0.68359375,
377
+ "learning_rate": 8.301282051282053e-05,
378
+ "loss": 0.01258222907781601,
379
+ "memory/device_reserved (GiB)": 36.5,
380
+ "memory/max_active (GiB)": 33.97,
381
+ "memory/max_allocated (GiB)": 33.97,
382
+ "ppl": 1.01266,
383
+ "step": 260,
384
+ "tokens/total": 4259840,
385
+ "tokens/train_per_sec_per_gpu": 9.24,
386
+ "tokens/trainable": 1339971
387
+ },
388
+ {
389
+ "epoch": 0.2591792656587473,
390
+ "grad_norm": 5.375,
391
+ "learning_rate": 8.621794871794873e-05,
392
+ "loss": 0.019256196916103363,
393
+ "memory/device_reserved (GiB)": 36.5,
394
+ "memory/max_active (GiB)": 33.97,
395
+ "memory/max_allocated (GiB)": 33.97,
396
+ "ppl": 1.01944,
397
+ "step": 270,
398
+ "tokens/total": 4423680,
399
+ "tokens/train_per_sec_per_gpu": 7.26,
400
+ "tokens/trainable": 1391647
401
+ },
402
+ {
403
+ "epoch": 0.26877849772018236,
404
+ "grad_norm": 0.6015625,
405
+ "learning_rate": 8.942307692307693e-05,
406
+ "loss": 0.03441511988639832,
407
+ "memory/device_reserved (GiB)": 36.5,
408
+ "memory/max_active (GiB)": 33.97,
409
+ "memory/max_allocated (GiB)": 33.97,
410
+ "ppl": 1.03501,
411
+ "step": 280,
412
+ "tokens/total": 4587520,
413
+ "tokens/train_per_sec_per_gpu": 8.07,
414
+ "tokens/trainable": 1443110
415
+ },
416
+ {
417
+ "epoch": 0.27837772978161746,
418
+ "grad_norm": 1.78125,
419
+ "learning_rate": 9.262820512820513e-05,
420
+ "loss": 0.0174191877245903,
421
+ "memory/device_reserved (GiB)": 36.5,
422
+ "memory/max_active (GiB)": 33.97,
423
+ "memory/max_allocated (GiB)": 33.97,
424
+ "ppl": 1.01757,
425
+ "step": 290,
426
+ "tokens/total": 4751360,
427
+ "tokens/train_per_sec_per_gpu": 8.75,
428
+ "tokens/trainable": 1495030
429
+ },
430
+ {
431
+ "epoch": 0.28797696184305255,
432
+ "grad_norm": 0.34375,
433
+ "learning_rate": 9.583333333333334e-05,
434
+ "loss": 0.010925143957138062,
435
+ "memory/device_reserved (GiB)": 36.5,
436
+ "memory/max_active (GiB)": 33.97,
437
+ "memory/max_allocated (GiB)": 33.97,
438
+ "ppl": 1.01099,
439
+ "step": 300,
440
+ "tokens/total": 4915200,
441
+ "tokens/train_per_sec_per_gpu": 8.65,
442
+ "tokens/trainable": 1545313
443
+ },
444
+ {
445
+ "epoch": 0.29757619390448764,
446
+ "grad_norm": 0.3125,
447
+ "learning_rate": 9.903846153846155e-05,
448
+ "loss": 0.010976283252239228,
449
+ "memory/device_reserved (GiB)": 36.5,
450
+ "memory/max_active (GiB)": 33.97,
451
+ "memory/max_allocated (GiB)": 33.97,
452
+ "ppl": 1.01104,
453
+ "step": 310,
454
+ "tokens/total": 5079040,
455
+ "tokens/train_per_sec_per_gpu": 9.3,
456
+ "tokens/trainable": 1596578
457
+ },
458
+ {
459
+ "epoch": 0.30717542596592273,
460
+ "grad_norm": 0.23046875,
461
+ "learning_rate": 0.0001,
462
+ "loss": 0.007863689959049226,
463
+ "memory/device_reserved (GiB)": 36.5,
464
+ "memory/max_active (GiB)": 33.97,
465
+ "memory/max_allocated (GiB)": 33.97,
466
+ "ppl": 1.00789,
467
+ "step": 320,
468
+ "tokens/total": 5242880,
469
+ "tokens/train_per_sec_per_gpu": 8.65,
470
+ "tokens/trainable": 1648111
471
+ },
472
+ {
473
+ "epoch": 0.3167746580273578,
474
+ "grad_norm": 0.4140625,
475
+ "learning_rate": 0.0001,
476
+ "loss": 0.0052565749734640125,
477
+ "memory/device_reserved (GiB)": 36.5,
478
+ "memory/max_active (GiB)": 33.97,
479
+ "memory/max_allocated (GiB)": 33.97,
480
+ "ppl": 1.00527,
481
+ "step": 330,
482
+ "tokens/total": 5406720,
483
+ "tokens/train_per_sec_per_gpu": 9.0,
484
+ "tokens/trainable": 1699588
485
+ },
486
+ {
487
+ "epoch": 0.3263738900887929,
488
+ "grad_norm": 0.296875,
489
+ "learning_rate": 0.0001,
490
+ "loss": 0.009607769548892975,
491
+ "memory/device_reserved (GiB)": 36.5,
492
+ "memory/max_active (GiB)": 33.97,
493
+ "memory/max_allocated (GiB)": 33.97,
494
+ "ppl": 1.00965,
495
+ "step": 340,
496
+ "tokens/total": 5570560,
497
+ "tokens/train_per_sec_per_gpu": 9.17,
498
+ "tokens/trainable": 1751730
499
+ },
500
+ {
501
+ "epoch": 0.33597312215022795,
502
+ "grad_norm": 0.38671875,
503
+ "learning_rate": 0.0001,
504
+ "loss": 0.007573225349187851,
505
+ "memory/device_reserved (GiB)": 36.5,
506
+ "memory/max_active (GiB)": 33.97,
507
+ "memory/max_allocated (GiB)": 33.97,
508
+ "ppl": 1.0076,
509
+ "step": 350,
510
+ "tokens/total": 5734400,
511
+ "tokens/train_per_sec_per_gpu": 9.0,
512
+ "tokens/trainable": 1802979
513
+ },
514
+ {
515
+ "epoch": 0.34557235421166305,
516
+ "grad_norm": 0.21484375,
517
+ "learning_rate": 0.0001,
518
+ "loss": 0.006453585624694824,
519
+ "memory/device_reserved (GiB)": 36.5,
520
+ "memory/max_active (GiB)": 33.97,
521
+ "memory/max_allocated (GiB)": 33.97,
522
+ "ppl": 1.00647,
523
+ "step": 360,
524
+ "tokens/total": 5898240,
525
+ "tokens/train_per_sec_per_gpu": 9.38,
526
+ "tokens/trainable": 1853732
527
+ },
528
+ {
529
+ "epoch": 0.35517158627309814,
530
+ "grad_norm": 0.169921875,
531
+ "learning_rate": 0.0001,
532
+ "loss": 0.006070464849472046,
533
+ "memory/device_reserved (GiB)": 36.5,
534
+ "memory/max_active (GiB)": 33.97,
535
+ "memory/max_allocated (GiB)": 33.97,
536
+ "ppl": 1.00609,
537
+ "step": 370,
538
+ "tokens/total": 6062080,
539
+ "tokens/train_per_sec_per_gpu": 8.22,
540
+ "tokens/trainable": 1904439
541
+ },
542
+ {
543
+ "epoch": 0.36477081833453323,
544
+ "grad_norm": 0.1435546875,
545
+ "learning_rate": 0.0001,
546
+ "loss": 0.005775686353445053,
547
+ "memory/device_reserved (GiB)": 36.5,
548
+ "memory/max_active (GiB)": 33.97,
549
+ "memory/max_allocated (GiB)": 33.97,
550
+ "ppl": 1.00579,
551
+ "step": 380,
552
+ "tokens/total": 6225920,
553
+ "tokens/train_per_sec_per_gpu": 8.22,
554
+ "tokens/trainable": 1955620
555
+ },
556
+ {
557
+ "epoch": 0.3743700503959683,
558
+ "grad_norm": 0.1513671875,
559
+ "learning_rate": 0.0001,
560
+ "loss": 0.005018413811922073,
561
+ "memory/device_reserved (GiB)": 36.5,
562
+ "memory/max_active (GiB)": 33.97,
563
+ "memory/max_allocated (GiB)": 33.97,
564
+ "ppl": 1.00503,
565
+ "step": 390,
566
+ "tokens/total": 6389760,
567
+ "tokens/train_per_sec_per_gpu": 9.88,
568
+ "tokens/trainable": 2007340
569
+ },
570
+ {
571
+ "epoch": 0.3839692824574034,
572
+ "grad_norm": 0.23828125,
573
+ "learning_rate": 0.0001,
574
+ "loss": 0.003989457339048386,
575
+ "memory/device_reserved (GiB)": 36.5,
576
+ "memory/max_active (GiB)": 33.97,
577
+ "memory/max_allocated (GiB)": 33.97,
578
+ "ppl": 1.004,
579
+ "step": 400,
580
+ "tokens/total": 6553600,
581
+ "tokens/train_per_sec_per_gpu": 7.98,
582
+ "tokens/trainable": 2059592
583
+ },
584
+ {
585
+ "epoch": 0.3935685145188385,
586
+ "grad_norm": 0.396484375,
587
+ "learning_rate": 0.0001,
588
+ "loss": 0.004139231517910957,
589
+ "memory/device_reserved (GiB)": 36.5,
590
+ "memory/max_active (GiB)": 33.97,
591
+ "memory/max_allocated (GiB)": 33.97,
592
+ "ppl": 1.00415,
593
+ "step": 410,
594
+ "tokens/total": 6717440,
595
+ "tokens/train_per_sec_per_gpu": 8.12,
596
+ "tokens/trainable": 2111025
597
+ },
598
+ {
599
+ "epoch": 0.4031677465802736,
600
+ "grad_norm": 0.265625,
601
+ "learning_rate": 0.0001,
602
+ "loss": 0.004084679111838341,
603
+ "memory/device_reserved (GiB)": 36.5,
604
+ "memory/max_active (GiB)": 33.97,
605
+ "memory/max_allocated (GiB)": 33.97,
606
+ "ppl": 1.00409,
607
+ "step": 420,
608
+ "tokens/total": 6881280,
609
+ "tokens/train_per_sec_per_gpu": 9.09,
610
+ "tokens/trainable": 2161939
611
+ },
612
+ {
613
+ "epoch": 0.41276697864170864,
614
+ "grad_norm": 0.111328125,
615
+ "learning_rate": 0.0001,
616
+ "loss": 0.0030223120003938673,
617
+ "memory/device_reserved (GiB)": 36.5,
618
+ "memory/max_active (GiB)": 33.97,
619
+ "memory/max_allocated (GiB)": 33.97,
620
+ "ppl": 1.00303,
621
+ "step": 430,
622
+ "tokens/total": 7045120,
623
+ "tokens/train_per_sec_per_gpu": 7.33,
624
+ "tokens/trainable": 2213313
625
+ },
626
+ {
627
+ "epoch": 0.42236621070314373,
628
+ "grad_norm": 0.107421875,
629
+ "learning_rate": 0.0001,
630
+ "loss": 0.0029419407248497008,
631
+ "memory/device_reserved (GiB)": 36.5,
632
+ "memory/max_active (GiB)": 33.97,
633
+ "memory/max_allocated (GiB)": 33.97,
634
+ "ppl": 1.00295,
635
+ "step": 440,
636
+ "tokens/total": 7208960,
637
+ "tokens/train_per_sec_per_gpu": 8.21,
638
+ "tokens/trainable": 2264334
639
+ },
640
+ {
641
+ "epoch": 0.4319654427645788,
642
+ "grad_norm": 0.1630859375,
643
+ "learning_rate": 0.0001,
644
+ "loss": 0.0034121278673410415,
645
+ "memory/device_reserved (GiB)": 36.5,
646
+ "memory/max_active (GiB)": 33.97,
647
+ "memory/max_allocated (GiB)": 33.97,
648
+ "ppl": 1.00342,
649
+ "step": 450,
650
+ "tokens/total": 7372800,
651
+ "tokens/train_per_sec_per_gpu": 7.85,
652
+ "tokens/trainable": 2315460
653
+ },
654
+ {
655
+ "epoch": 0.4415646748260139,
656
+ "grad_norm": 0.1513671875,
657
+ "learning_rate": 0.0001,
658
+ "loss": 0.002534863166511059,
659
+ "memory/device_reserved (GiB)": 36.5,
660
+ "memory/max_active (GiB)": 33.97,
661
+ "memory/max_allocated (GiB)": 33.97,
662
+ "ppl": 1.00254,
663
+ "step": 460,
664
+ "tokens/total": 7536640,
665
+ "tokens/train_per_sec_per_gpu": 8.37,
666
+ "tokens/trainable": 2366296
667
+ },
668
+ {
669
+ "epoch": 0.451163906887449,
670
+ "grad_norm": 0.07373046875,
671
+ "learning_rate": 0.0001,
672
+ "loss": 0.0022289998829364776,
673
+ "memory/device_reserved (GiB)": 36.5,
674
+ "memory/max_active (GiB)": 33.97,
675
+ "memory/max_allocated (GiB)": 33.97,
676
+ "ppl": 1.00223,
677
+ "step": 470,
678
+ "tokens/total": 7700480,
679
+ "tokens/train_per_sec_per_gpu": 8.15,
680
+ "tokens/trainable": 2417678
681
+ },
682
+ {
683
+ "epoch": 0.4607631389488841,
684
+ "grad_norm": 0.1494140625,
685
+ "learning_rate": 0.0001,
686
+ "loss": 0.002741745673120022,
687
+ "memory/device_reserved (GiB)": 36.5,
688
+ "memory/max_active (GiB)": 33.97,
689
+ "memory/max_allocated (GiB)": 33.97,
690
+ "ppl": 1.00275,
691
+ "step": 480,
692
+ "tokens/total": 7864320,
693
+ "tokens/train_per_sec_per_gpu": 9.5,
694
+ "tokens/trainable": 2469401
695
+ },
696
+ {
697
+ "epoch": 0.4703623710103192,
698
+ "grad_norm": 0.1884765625,
699
+ "learning_rate": 0.0001,
700
+ "loss": 0.0031233657151460647,
701
+ "memory/device_reserved (GiB)": 36.5,
702
+ "memory/max_active (GiB)": 33.97,
703
+ "memory/max_allocated (GiB)": 33.97,
704
+ "ppl": 1.00313,
705
+ "step": 490,
706
+ "tokens/total": 8028160,
707
+ "tokens/train_per_sec_per_gpu": 9.42,
708
+ "tokens/trainable": 2520706
709
+ },
710
+ {
711
+ "epoch": 0.4799616030717543,
712
+ "grad_norm": 0.2578125,
713
+ "learning_rate": 0.0001,
714
+ "loss": 0.007012879848480225,
715
+ "memory/device_reserved (GiB)": 36.5,
716
+ "memory/max_active (GiB)": 33.97,
717
+ "memory/max_allocated (GiB)": 33.97,
718
+ "ppl": 1.00704,
719
+ "step": 500,
720
+ "tokens/total": 8192000,
721
+ "tokens/train_per_sec_per_gpu": 8.62,
722
+ "tokens/trainable": 2571927
723
+ },
724
+ {
725
+ "epoch": 0.4895608351331893,
726
+ "grad_norm": 0.17578125,
727
+ "learning_rate": 0.0001,
728
+ "loss": 0.007117580622434616,
729
+ "memory/device_reserved (GiB)": 36.5,
730
+ "memory/max_active (GiB)": 33.97,
731
+ "memory/max_allocated (GiB)": 33.97,
732
+ "ppl": 1.00714,
733
+ "step": 510,
734
+ "tokens/total": 8355840,
735
+ "tokens/train_per_sec_per_gpu": 8.66,
736
+ "tokens/trainable": 2623451
737
+ },
738
+ {
739
+ "epoch": 0.4991600671946244,
740
+ "grad_norm": 0.2490234375,
741
+ "learning_rate": 0.0001,
742
+ "loss": 0.004591656103730202,
743
+ "memory/device_reserved (GiB)": 36.5,
744
+ "memory/max_active (GiB)": 33.97,
745
+ "memory/max_allocated (GiB)": 33.97,
746
+ "ppl": 1.0046,
747
+ "step": 520,
748
+ "tokens/total": 8519680,
749
+ "tokens/train_per_sec_per_gpu": 9.12,
750
+ "tokens/trainable": 2674443
751
+ },
752
+ {
753
+ "epoch": 0.5001199904007679,
754
+ "eval_loss": 0.0049089775420725346,
755
+ "eval_ppl": 1.00492,
756
+ "eval_runtime": 9.006,
757
+ "eval_samples_per_second": 22.207,
758
+ "eval_steps_per_second": 22.207,
759
+ "memory/device_reserved (GiB)": 36.5,
760
+ "memory/max_active (GiB)": 33.97,
761
+ "memory/max_allocated (GiB)": 33.97,
762
+ "step": 521
763
+ },
764
+ {
765
+ "epoch": 0.5087592992560596,
766
+ "grad_norm": 0.185546875,
767
+ "learning_rate": 0.0001,
768
+ "loss": 0.004098504409193992,
769
+ "memory/device_reserved (GiB)": 35.97,
770
+ "memory/max_active (GiB)": 33.96,
771
+ "memory/max_allocated (GiB)": 33.96,
772
+ "ppl": 1.00411,
773
+ "step": 530,
774
+ "tokens/total": 8683520,
775
+ "tokens/train_per_sec_per_gpu": 7.44,
776
+ "tokens/trainable": 2725882
777
+ },
778
+ {
779
+ "epoch": 0.5183585313174947,
780
+ "grad_norm": 0.2578125,
781
+ "learning_rate": 0.0001,
782
+ "loss": 0.004386116191744805,
783
+ "memory/device_reserved (GiB)": 35.97,
784
+ "memory/max_active (GiB)": 33.95,
785
+ "memory/max_allocated (GiB)": 33.95,
786
+ "ppl": 1.0044,
787
+ "step": 540,
788
+ "tokens/total": 8847360,
789
+ "tokens/train_per_sec_per_gpu": 8.34,
790
+ "tokens/trainable": 2777518
791
+ },
792
+ {
793
+ "epoch": 0.5279577633789296,
794
+ "grad_norm": 0.166015625,
795
+ "learning_rate": 0.0001,
796
+ "loss": 0.003661666065454483,
797
+ "memory/device_reserved (GiB)": 35.97,
798
+ "memory/max_active (GiB)": 33.95,
799
+ "memory/max_allocated (GiB)": 33.95,
800
+ "ppl": 1.00367,
801
+ "step": 550,
802
+ "tokens/total": 9011200,
803
+ "tokens/train_per_sec_per_gpu": 9.27,
804
+ "tokens/trainable": 2828761
805
+ },
806
+ {
807
+ "epoch": 0.5375569954403647,
808
+ "grad_norm": 0.26171875,
809
+ "learning_rate": 0.0001,
810
+ "loss": 0.0033755451440811157,
811
+ "memory/device_reserved (GiB)": 35.97,
812
+ "memory/max_active (GiB)": 33.95,
813
+ "memory/max_allocated (GiB)": 33.95,
814
+ "ppl": 1.00338,
815
+ "step": 560,
816
+ "tokens/total": 9175040,
817
+ "tokens/train_per_sec_per_gpu": 8.99,
818
+ "tokens/trainable": 2879740
819
+ },
820
+ {
821
+ "epoch": 0.5471562275017998,
822
+ "grad_norm": 0.201171875,
823
+ "learning_rate": 0.0001,
824
+ "loss": 0.003271551802754402,
825
+ "memory/device_reserved (GiB)": 35.97,
826
+ "memory/max_active (GiB)": 33.95,
827
+ "memory/max_allocated (GiB)": 33.95,
828
+ "ppl": 1.00328,
829
+ "step": 570,
830
+ "tokens/total": 9338880,
831
+ "tokens/train_per_sec_per_gpu": 8.89,
832
+ "tokens/trainable": 2931031
833
+ },
834
+ {
835
+ "epoch": 0.5567554595632349,
836
+ "grad_norm": 0.11279296875,
837
+ "learning_rate": 0.0001,
838
+ "loss": 0.00392816960811615,
839
+ "memory/device_reserved (GiB)": 35.97,
840
+ "memory/max_active (GiB)": 33.95,
841
+ "memory/max_allocated (GiB)": 33.95,
842
+ "ppl": 1.00394,
843
+ "step": 580,
844
+ "tokens/total": 9502720,
845
+ "tokens/train_per_sec_per_gpu": 8.26,
846
+ "tokens/trainable": 2981776
847
+ },
848
+ {
849
+ "epoch": 0.56635469162467,
850
+ "grad_norm": 0.1650390625,
851
+ "learning_rate": 0.0001,
852
+ "loss": 0.0031796425580978395,
853
+ "memory/device_reserved (GiB)": 35.97,
854
+ "memory/max_active (GiB)": 33.95,
855
+ "memory/max_allocated (GiB)": 33.95,
856
+ "ppl": 1.00318,
857
+ "step": 590,
858
+ "tokens/total": 9666560,
859
+ "tokens/train_per_sec_per_gpu": 8.26,
860
+ "tokens/trainable": 3032962
861
+ },
862
+ {
863
+ "epoch": 0.5759539236861051,
864
+ "grad_norm": 0.2060546875,
865
+ "learning_rate": 0.0001,
866
+ "loss": 0.002615358680486679,
867
+ "memory/device_reserved (GiB)": 35.97,
868
+ "memory/max_active (GiB)": 33.95,
869
+ "memory/max_allocated (GiB)": 33.95,
870
+ "ppl": 1.00262,
871
+ "step": 600,
872
+ "tokens/total": 9830400,
873
+ "tokens/train_per_sec_per_gpu": 7.91,
874
+ "tokens/trainable": 3084477
875
+ },
876
+ {
877
+ "epoch": 0.5855531557475402,
878
+ "grad_norm": 0.275390625,
879
+ "learning_rate": 0.0001,
880
+ "loss": 0.0032230135053396224,
881
+ "memory/device_reserved (GiB)": 35.97,
882
+ "memory/max_active (GiB)": 33.95,
883
+ "memory/max_allocated (GiB)": 33.95,
884
+ "ppl": 1.00323,
885
+ "step": 610,
886
+ "tokens/total": 9994240,
887
+ "tokens/train_per_sec_per_gpu": 8.79,
888
+ "tokens/trainable": 3135491
889
+ },
890
+ {
891
+ "epoch": 0.5951523878089753,
892
+ "grad_norm": 0.169921875,
893
+ "learning_rate": 0.0001,
894
+ "loss": 0.002824882231652737,
895
+ "memory/device_reserved (GiB)": 35.97,
896
+ "memory/max_active (GiB)": 33.95,
897
+ "memory/max_allocated (GiB)": 33.95,
898
+ "ppl": 1.00283,
899
+ "step": 620,
900
+ "tokens/total": 10158080,
901
+ "tokens/train_per_sec_per_gpu": 7.72,
902
+ "tokens/trainable": 3186671
903
+ },
904
+ {
905
+ "epoch": 0.6047516198704104,
906
+ "grad_norm": 0.126953125,
907
+ "learning_rate": 0.0001,
908
+ "loss": 0.0019276419654488564,
909
+ "memory/device_reserved (GiB)": 35.97,
910
+ "memory/max_active (GiB)": 33.95,
911
+ "memory/max_allocated (GiB)": 33.95,
912
+ "ppl": 1.00193,
913
+ "step": 630,
914
+ "tokens/total": 10321920,
915
+ "tokens/train_per_sec_per_gpu": 9.45,
916
+ "tokens/trainable": 3238286
917
+ },
918
+ {
919
+ "epoch": 0.6143508519318455,
920
+ "grad_norm": 0.09375,
921
+ "learning_rate": 0.0001,
922
+ "loss": 0.0023364221677184107,
923
+ "memory/device_reserved (GiB)": 35.97,
924
+ "memory/max_active (GiB)": 33.95,
925
+ "memory/max_allocated (GiB)": 33.95,
926
+ "ppl": 1.00234,
927
+ "step": 640,
928
+ "tokens/total": 10485760,
929
+ "tokens/train_per_sec_per_gpu": 8.34,
930
+ "tokens/trainable": 3290171
931
+ },
932
+ {
933
+ "epoch": 0.6239500839932806,
934
+ "grad_norm": 0.11181640625,
935
+ "learning_rate": 0.0001,
936
+ "loss": 0.0024619314819574354,
937
+ "memory/device_reserved (GiB)": 35.97,
938
+ "memory/max_active (GiB)": 33.95,
939
+ "memory/max_allocated (GiB)": 33.95,
940
+ "ppl": 1.00246,
941
+ "step": 650,
942
+ "tokens/total": 10649600,
943
+ "tokens/train_per_sec_per_gpu": 8.26,
944
+ "tokens/trainable": 3341228
945
+ },
946
+ {
947
+ "epoch": 0.6335493160547156,
948
+ "grad_norm": 0.017333984375,
949
+ "learning_rate": 0.0001,
950
+ "loss": 0.0014947694726288319,
951
+ "memory/device_reserved (GiB)": 35.97,
952
+ "memory/max_active (GiB)": 33.95,
953
+ "memory/max_allocated (GiB)": 33.95,
954
+ "ppl": 1.0015,
955
+ "step": 660,
956
+ "tokens/total": 10813440,
957
+ "tokens/train_per_sec_per_gpu": 7.53,
958
+ "tokens/trainable": 3392191
959
+ },
960
+ {
961
+ "epoch": 0.6431485481161507,
962
+ "grad_norm": 0.10693359375,
963
+ "learning_rate": 0.0001,
964
+ "loss": 0.0016866009682416916,
965
+ "memory/device_reserved (GiB)": 35.97,
966
+ "memory/max_active (GiB)": 33.95,
967
+ "memory/max_allocated (GiB)": 33.95,
968
+ "ppl": 1.00169,
969
+ "step": 670,
970
+ "tokens/total": 10977280,
971
+ "tokens/train_per_sec_per_gpu": 7.27,
972
+ "tokens/trainable": 3442697
973
+ },
974
+ {
975
+ "epoch": 0.6527477801775858,
976
+ "grad_norm": 0.09033203125,
977
+ "learning_rate": 0.0001,
978
+ "loss": 0.0014289443381130696,
979
+ "memory/device_reserved (GiB)": 35.97,
980
+ "memory/max_active (GiB)": 33.95,
981
+ "memory/max_allocated (GiB)": 33.95,
982
+ "ppl": 1.00143,
983
+ "step": 680,
984
+ "tokens/total": 11141120,
985
+ "tokens/train_per_sec_per_gpu": 9.48,
986
+ "tokens/trainable": 3494178
987
+ },
988
+ {
989
+ "epoch": 0.6623470122390209,
990
+ "grad_norm": 0.1328125,
991
+ "learning_rate": 0.0001,
992
+ "loss": 0.0012737856246531009,
993
+ "memory/device_reserved (GiB)": 35.97,
994
+ "memory/max_active (GiB)": 33.95,
995
+ "memory/max_allocated (GiB)": 33.95,
996
+ "ppl": 1.00127,
997
+ "step": 690,
998
+ "tokens/total": 11304960,
999
+ "tokens/train_per_sec_per_gpu": 9.0,
1000
+ "tokens/trainable": 3545238
1001
+ },
1002
+ {
1003
+ "epoch": 0.6719462443004559,
1004
+ "grad_norm": 0.0966796875,
1005
+ "learning_rate": 0.0001,
1006
+ "loss": 0.0016797658056020737,
1007
+ "memory/device_reserved (GiB)": 35.97,
1008
+ "memory/max_active (GiB)": 33.95,
1009
+ "memory/max_allocated (GiB)": 33.95,
1010
+ "ppl": 1.00168,
1011
+ "step": 700,
1012
+ "tokens/total": 11468800,
1013
+ "tokens/train_per_sec_per_gpu": 8.11,
1014
+ "tokens/trainable": 3595974
1015
+ },
1016
+ {
1017
+ "epoch": 0.681545476361891,
1018
+ "grad_norm": 0.10986328125,
1019
+ "learning_rate": 0.0001,
1020
+ "loss": 0.0012735738418996334,
1021
+ "memory/device_reserved (GiB)": 35.97,
1022
+ "memory/max_active (GiB)": 33.95,
1023
+ "memory/max_allocated (GiB)": 33.95,
1024
+ "ppl": 1.00127,
1025
+ "step": 710,
1026
+ "tokens/total": 11632640,
1027
+ "tokens/train_per_sec_per_gpu": 7.24,
1028
+ "tokens/trainable": 3646406
1029
+ },
1030
+ {
1031
+ "epoch": 0.6911447084233261,
1032
+ "grad_norm": 0.1171875,
1033
+ "learning_rate": 0.0001,
1034
+ "loss": 0.0016826316714286804,
1035
+ "memory/device_reserved (GiB)": 35.97,
1036
+ "memory/max_active (GiB)": 33.95,
1037
+ "memory/max_allocated (GiB)": 33.95,
1038
+ "ppl": 1.00168,
1039
+ "step": 720,
1040
+ "tokens/total": 11796480,
1041
+ "tokens/train_per_sec_per_gpu": 8.32,
1042
+ "tokens/trainable": 3697893
1043
+ },
1044
+ {
1045
+ "epoch": 0.7007439404847612,
1046
+ "grad_norm": 0.08642578125,
1047
+ "learning_rate": 0.0001,
1048
+ "loss": 0.001028579194098711,
1049
+ "memory/device_reserved (GiB)": 35.97,
1050
+ "memory/max_active (GiB)": 33.95,
1051
+ "memory/max_allocated (GiB)": 33.95,
1052
+ "ppl": 1.00103,
1053
+ "step": 730,
1054
+ "tokens/total": 11960320,
1055
+ "tokens/train_per_sec_per_gpu": 7.29,
1056
+ "tokens/trainable": 3749410
1057
+ },
1058
+ {
1059
+ "epoch": 0.7103431725461963,
1060
+ "grad_norm": 0.078125,
1061
+ "learning_rate": 0.0001,
1062
+ "loss": 0.0013211018405854702,
1063
+ "memory/device_reserved (GiB)": 35.97,
1064
+ "memory/max_active (GiB)": 33.95,
1065
+ "memory/max_allocated (GiB)": 33.95,
1066
+ "ppl": 1.00132,
1067
+ "step": 740,
1068
+ "tokens/total": 12124160,
1069
+ "tokens/train_per_sec_per_gpu": 7.22,
1070
+ "tokens/trainable": 3800410
1071
+ },
1072
+ {
1073
+ "epoch": 0.7199424046076314,
1074
+ "grad_norm": 3.546875,
1075
+ "learning_rate": 0.0001,
1076
+ "loss": 0.5068239688873291,
1077
+ "memory/device_reserved (GiB)": 35.97,
1078
+ "memory/max_active (GiB)": 33.95,
1079
+ "memory/max_allocated (GiB)": 33.95,
1080
+ "ppl": 1.66001,
1081
+ "step": 750,
1082
+ "tokens/total": 12288000,
1083
+ "tokens/train_per_sec_per_gpu": 8.82,
1084
+ "tokens/trainable": 3852024
1085
+ },
1086
+ {
1087
+ "epoch": 0.7295416366690665,
1088
+ "grad_norm": 1.609375,
1089
+ "learning_rate": 0.0001,
1090
+ "loss": 0.042395052313804624,
1091
+ "memory/device_reserved (GiB)": 35.97,
1092
+ "memory/max_active (GiB)": 33.95,
1093
+ "memory/max_allocated (GiB)": 33.95,
1094
+ "ppl": 1.04331,
1095
+ "step": 760,
1096
+ "tokens/total": 12451840,
1097
+ "tokens/train_per_sec_per_gpu": 9.1,
1098
+ "tokens/trainable": 3903571
1099
+ },
1100
+ {
1101
+ "epoch": 0.7391408687305016,
1102
+ "grad_norm": 0.443359375,
1103
+ "learning_rate": 0.0001,
1104
+ "loss": 0.054154080152511594,
1105
+ "memory/device_reserved (GiB)": 35.97,
1106
+ "memory/max_active (GiB)": 33.95,
1107
+ "memory/max_allocated (GiB)": 33.95,
1108
+ "ppl": 1.05565,
1109
+ "step": 770,
1110
+ "tokens/total": 12615680,
1111
+ "tokens/train_per_sec_per_gpu": 8.87,
1112
+ "tokens/trainable": 3954797
1113
+ },
1114
+ {
1115
+ "epoch": 0.7487401007919366,
1116
+ "grad_norm": 0.62109375,
1117
+ "learning_rate": 0.0001,
1118
+ "loss": 0.015584257245063782,
1119
+ "memory/device_reserved (GiB)": 35.97,
1120
+ "memory/max_active (GiB)": 33.95,
1121
+ "memory/max_allocated (GiB)": 33.95,
1122
+ "ppl": 1.01571,
1123
+ "step": 780,
1124
+ "tokens/total": 12779520,
1125
+ "tokens/train_per_sec_per_gpu": 7.53,
1126
+ "tokens/trainable": 4005775
1127
+ },
1128
+ {
1129
+ "epoch": 0.7583393328533717,
1130
+ "grad_norm": 0.9765625,
1131
+ "learning_rate": 0.0001,
1132
+ "loss": 0.02899232506752014,
1133
+ "memory/device_reserved (GiB)": 35.97,
1134
+ "memory/max_active (GiB)": 33.95,
1135
+ "memory/max_allocated (GiB)": 33.95,
1136
+ "ppl": 1.02942,
1137
+ "step": 790,
1138
+ "tokens/total": 12943360,
1139
+ "tokens/train_per_sec_per_gpu": 8.92,
1140
+ "tokens/trainable": 4056679
1141
+ },
1142
+ {
1143
+ "epoch": 0.7679385649148068,
1144
+ "grad_norm": 0.291015625,
1145
+ "learning_rate": 0.0001,
1146
+ "loss": 0.011905992776155472,
1147
+ "memory/device_reserved (GiB)": 35.97,
1148
+ "memory/max_active (GiB)": 33.95,
1149
+ "memory/max_allocated (GiB)": 33.95,
1150
+ "ppl": 1.01198,
1151
+ "step": 800,
1152
+ "tokens/total": 13107200,
1153
+ "tokens/train_per_sec_per_gpu": 9.32,
1154
+ "tokens/trainable": 4107997
1155
+ },
1156
+ {
1157
+ "epoch": 0.7775377969762419,
1158
+ "grad_norm": 0.234375,
1159
+ "learning_rate": 0.0001,
1160
+ "loss": 0.006751462817192078,
1161
+ "memory/device_reserved (GiB)": 35.97,
1162
+ "memory/max_active (GiB)": 33.95,
1163
+ "memory/max_allocated (GiB)": 33.95,
1164
+ "ppl": 1.00677,
1165
+ "step": 810,
1166
+ "tokens/total": 13271040,
1167
+ "tokens/train_per_sec_per_gpu": 7.92,
1168
+ "tokens/trainable": 4158984
1169
+ },
1170
+ {
1171
+ "epoch": 0.787137029037677,
1172
+ "grad_norm": 0.294921875,
1173
+ "learning_rate": 0.0001,
1174
+ "loss": 0.005448491126298904,
1175
+ "memory/device_reserved (GiB)": 35.97,
1176
+ "memory/max_active (GiB)": 33.95,
1177
+ "memory/max_allocated (GiB)": 33.95,
1178
+ "ppl": 1.00546,
1179
+ "step": 820,
1180
+ "tokens/total": 13434880,
1181
+ "tokens/train_per_sec_per_gpu": 9.18,
1182
+ "tokens/trainable": 4210162
1183
+ },
1184
+ {
1185
+ "epoch": 0.7967362610991121,
1186
+ "grad_norm": 0.357421875,
1187
+ "learning_rate": 0.0001,
1188
+ "loss": 0.00531839057803154,
1189
+ "memory/device_reserved (GiB)": 35.97,
1190
+ "memory/max_active (GiB)": 33.95,
1191
+ "memory/max_allocated (GiB)": 33.95,
1192
+ "ppl": 1.00533,
1193
+ "step": 830,
1194
+ "tokens/total": 13598720,
1195
+ "tokens/train_per_sec_per_gpu": 7.73,
1196
+ "tokens/trainable": 4261207
1197
+ },
1198
+ {
1199
+ "epoch": 0.8063354931605472,
1200
+ "grad_norm": 0.54296875,
1201
+ "learning_rate": 0.0001,
1202
+ "loss": 0.004298893362283706,
1203
+ "memory/device_reserved (GiB)": 35.97,
1204
+ "memory/max_active (GiB)": 33.95,
1205
+ "memory/max_allocated (GiB)": 33.95,
1206
+ "ppl": 1.00431,
1207
+ "step": 840,
1208
+ "tokens/total": 13762560,
1209
+ "tokens/train_per_sec_per_gpu": 8.2,
1210
+ "tokens/trainable": 4312249
1211
+ },
1212
+ {
1213
+ "epoch": 0.8159347252219823,
1214
+ "grad_norm": 0.369140625,
1215
+ "learning_rate": 0.0001,
1216
+ "loss": 0.006566829234361649,
1217
+ "memory/device_reserved (GiB)": 35.97,
1218
+ "memory/max_active (GiB)": 33.95,
1219
+ "memory/max_allocated (GiB)": 33.95,
1220
+ "ppl": 1.00659,
1221
+ "step": 850,
1222
+ "tokens/total": 13926400,
1223
+ "tokens/train_per_sec_per_gpu": 9.0,
1224
+ "tokens/trainable": 4363740
1225
+ },
1226
+ {
1227
+ "epoch": 0.8255339572834173,
1228
+ "grad_norm": 0.2138671875,
1229
+ "learning_rate": 0.0001,
1230
+ "loss": 0.0052708122879266735,
1231
+ "memory/device_reserved (GiB)": 35.97,
1232
+ "memory/max_active (GiB)": 33.95,
1233
+ "memory/max_allocated (GiB)": 33.95,
1234
+ "ppl": 1.00528,
1235
+ "step": 860,
1236
+ "tokens/total": 14090240,
1237
+ "tokens/train_per_sec_per_gpu": 8.52,
1238
+ "tokens/trainable": 4414546
1239
+ },
1240
+ {
1241
+ "epoch": 0.8351331893448524,
1242
+ "grad_norm": 0.251953125,
1243
+ "learning_rate": 0.0001,
1244
+ "loss": 0.004131903126835823,
1245
+ "memory/device_reserved (GiB)": 35.97,
1246
+ "memory/max_active (GiB)": 33.95,
1247
+ "memory/max_allocated (GiB)": 33.95,
1248
+ "ppl": 1.00414,
1249
+ "step": 870,
1250
+ "tokens/total": 14254080,
1251
+ "tokens/train_per_sec_per_gpu": 9.16,
1252
+ "tokens/trainable": 4465604
1253
+ },
1254
+ {
1255
+ "epoch": 0.8447324214062875,
1256
+ "grad_norm": 0.1435546875,
1257
+ "learning_rate": 0.0001,
1258
+ "loss": 0.003341007232666016,
1259
+ "memory/device_reserved (GiB)": 35.97,
1260
+ "memory/max_active (GiB)": 33.95,
1261
+ "memory/max_allocated (GiB)": 33.95,
1262
+ "ppl": 1.00335,
1263
+ "step": 880,
1264
+ "tokens/total": 14417920,
1265
+ "tokens/train_per_sec_per_gpu": 9.26,
1266
+ "tokens/trainable": 4516755
1267
+ },
1268
+ {
1269
+ "epoch": 0.8543316534677226,
1270
+ "grad_norm": 0.1337890625,
1271
+ "learning_rate": 0.0001,
1272
+ "loss": 0.0022021437063813208,
1273
+ "memory/device_reserved (GiB)": 35.97,
1274
+ "memory/max_active (GiB)": 33.95,
1275
+ "memory/max_allocated (GiB)": 33.95,
1276
+ "ppl": 1.0022,
1277
+ "step": 890,
1278
+ "tokens/total": 14581760,
1279
+ "tokens/train_per_sec_per_gpu": 9.37,
1280
+ "tokens/trainable": 4568307
1281
+ },
1282
+ {
1283
+ "epoch": 0.8639308855291576,
1284
+ "grad_norm": 0.10400390625,
1285
+ "learning_rate": 0.0001,
1286
+ "loss": 0.001743432879447937,
1287
+ "memory/device_reserved (GiB)": 35.97,
1288
+ "memory/max_active (GiB)": 33.95,
1289
+ "memory/max_allocated (GiB)": 33.95,
1290
+ "ppl": 1.00174,
1291
+ "step": 900,
1292
+ "tokens/total": 14745600,
1293
+ "tokens/train_per_sec_per_gpu": 7.47,
1294
+ "tokens/trainable": 4619967
1295
+ },
1296
+ {
1297
+ "epoch": 0.8735301175905927,
1298
+ "grad_norm": 0.0966796875,
1299
+ "learning_rate": 0.0001,
1300
+ "loss": 0.001867898181080818,
1301
+ "memory/device_reserved (GiB)": 35.97,
1302
+ "memory/max_active (GiB)": 33.95,
1303
+ "memory/max_allocated (GiB)": 33.95,
1304
+ "ppl": 1.00187,
1305
+ "step": 910,
1306
+ "tokens/total": 14909440,
1307
+ "tokens/train_per_sec_per_gpu": 9.38,
1308
+ "tokens/trainable": 4671271
1309
+ },
1310
+ {
1311
+ "epoch": 0.8831293496520278,
1312
+ "grad_norm": 0.06494140625,
1313
+ "learning_rate": 0.0001,
1314
+ "loss": 0.0017737392336130142,
1315
+ "memory/device_reserved (GiB)": 35.97,
1316
+ "memory/max_active (GiB)": 33.95,
1317
+ "memory/max_allocated (GiB)": 33.95,
1318
+ "ppl": 1.00178,
1319
+ "step": 920,
1320
+ "tokens/total": 15073280,
1321
+ "tokens/train_per_sec_per_gpu": 9.17,
1322
+ "tokens/trainable": 4723106
1323
+ },
1324
+ {
1325
+ "epoch": 0.8927285817134629,
1326
+ "grad_norm": 0.1572265625,
1327
+ "learning_rate": 0.0001,
1328
+ "loss": 0.0029280630871653555,
1329
+ "memory/device_reserved (GiB)": 35.97,
1330
+ "memory/max_active (GiB)": 33.95,
1331
+ "memory/max_allocated (GiB)": 33.95,
1332
+ "ppl": 1.00293,
1333
+ "step": 930,
1334
+ "tokens/total": 15237120,
1335
+ "tokens/train_per_sec_per_gpu": 8.5,
1336
+ "tokens/trainable": 4774009
1337
+ },
1338
+ {
1339
+ "epoch": 0.902327813774898,
1340
+ "grad_norm": 0.09326171875,
1341
+ "learning_rate": 0.0001,
1342
+ "loss": 0.002748473361134529,
1343
+ "memory/device_reserved (GiB)": 35.97,
1344
+ "memory/max_active (GiB)": 33.95,
1345
+ "memory/max_allocated (GiB)": 33.95,
1346
+ "ppl": 1.00275,
1347
+ "step": 940,
1348
+ "tokens/total": 15400960,
1349
+ "tokens/train_per_sec_per_gpu": 8.9,
1350
+ "tokens/trainable": 4825081
1351
+ },
1352
+ {
1353
+ "epoch": 0.9119270458363331,
1354
+ "grad_norm": 0.10302734375,
1355
+ "learning_rate": 0.0001,
1356
+ "loss": 0.0015982367098331452,
1357
+ "memory/device_reserved (GiB)": 35.97,
1358
+ "memory/max_active (GiB)": 33.95,
1359
+ "memory/max_allocated (GiB)": 33.95,
1360
+ "ppl": 1.0016,
1361
+ "step": 950,
1362
+ "tokens/total": 15564800,
1363
+ "tokens/train_per_sec_per_gpu": 9.43,
1364
+ "tokens/trainable": 4877201
1365
+ },
1366
+ {
1367
+ "epoch": 0.9215262778977682,
1368
+ "grad_norm": 0.0810546875,
1369
+ "learning_rate": 0.0001,
1370
+ "loss": 0.0018960090354084968,
1371
+ "memory/device_reserved (GiB)": 35.97,
1372
+ "memory/max_active (GiB)": 33.95,
1373
+ "memory/max_allocated (GiB)": 33.95,
1374
+ "ppl": 1.0019,
1375
+ "step": 960,
1376
+ "tokens/total": 15728640,
1377
+ "tokens/train_per_sec_per_gpu": 9.03,
1378
+ "tokens/trainable": 4929476
1379
+ },
1380
+ {
1381
+ "epoch": 0.9311255099592033,
1382
+ "grad_norm": 0.1484375,
1383
+ "learning_rate": 0.0001,
1384
+ "loss": 0.0017032548785209656,
1385
+ "memory/device_reserved (GiB)": 35.97,
1386
+ "memory/max_active (GiB)": 33.95,
1387
+ "memory/max_allocated (GiB)": 33.95,
1388
+ "ppl": 1.0017,
1389
+ "step": 970,
1390
+ "tokens/total": 15892480,
1391
+ "tokens/train_per_sec_per_gpu": 8.97,
1392
+ "tokens/trainable": 4981090
1393
+ },
1394
+ {
1395
+ "epoch": 0.9407247420206384,
1396
+ "grad_norm": 0.0380859375,
1397
+ "learning_rate": 0.0001,
1398
+ "loss": 0.0012425887398421764,
1399
+ "memory/device_reserved (GiB)": 35.97,
1400
+ "memory/max_active (GiB)": 33.95,
1401
+ "memory/max_allocated (GiB)": 33.95,
1402
+ "ppl": 1.00124,
1403
+ "step": 980,
1404
+ "tokens/total": 16056320,
1405
+ "tokens/train_per_sec_per_gpu": 8.64,
1406
+ "tokens/trainable": 5032187
1407
+ },
1408
+ {
1409
+ "epoch": 0.9503239740820735,
1410
+ "grad_norm": 0.1767578125,
1411
+ "learning_rate": 0.0001,
1412
+ "loss": 0.0016014887019991874,
1413
+ "memory/device_reserved (GiB)": 35.97,
1414
+ "memory/max_active (GiB)": 33.95,
1415
+ "memory/max_allocated (GiB)": 33.95,
1416
+ "ppl": 1.0016,
1417
+ "step": 990,
1418
+ "tokens/total": 16220160,
1419
+ "tokens/train_per_sec_per_gpu": 9.25,
1420
+ "tokens/trainable": 5083255
1421
+ },
1422
+ {
1423
+ "epoch": 0.9599232061435086,
1424
+ "grad_norm": 0.173828125,
1425
+ "learning_rate": 0.0001,
1426
+ "loss": 0.0015840081498026848,
1427
+ "memory/device_reserved (GiB)": 35.97,
1428
+ "memory/max_active (GiB)": 33.95,
1429
+ "memory/max_allocated (GiB)": 33.95,
1430
+ "ppl": 1.00159,
1431
+ "step": 1000,
1432
+ "tokens/total": 16384000,
1433
+ "tokens/train_per_sec_per_gpu": 8.04,
1434
+ "tokens/trainable": 5133643
1435
+ },
1436
+ {
1437
+ "epoch": 0.9695224382049437,
1438
+ "grad_norm": 0.078125,
1439
+ "learning_rate": 0.0001,
1440
+ "loss": 0.0026744097471237183,
1441
+ "memory/device_reserved (GiB)": 35.97,
1442
+ "memory/max_active (GiB)": 33.95,
1443
+ "memory/max_allocated (GiB)": 33.95,
1444
+ "ppl": 1.00268,
1445
+ "step": 1010,
1446
+ "tokens/total": 16547840,
1447
+ "tokens/train_per_sec_per_gpu": 7.69,
1448
+ "tokens/trainable": 5185486
1449
+ },
1450
+ {
1451
+ "epoch": 0.9791216702663786,
1452
+ "grad_norm": 0.0306396484375,
1453
+ "learning_rate": 0.0001,
1454
+ "loss": 0.0014427711255848407,
1455
+ "memory/device_reserved (GiB)": 35.97,
1456
+ "memory/max_active (GiB)": 33.95,
1457
+ "memory/max_allocated (GiB)": 33.95,
1458
+ "ppl": 1.00144,
1459
+ "step": 1020,
1460
+ "tokens/total": 16711680,
1461
+ "tokens/train_per_sec_per_gpu": 7.37,
1462
+ "tokens/trainable": 5236078
1463
+ },
1464
+ {
1465
+ "epoch": 0.9887209023278137,
1466
+ "grad_norm": 0.044921875,
1467
+ "learning_rate": 0.0001,
1468
+ "loss": 0.0015522641129791736,
1469
+ "memory/device_reserved (GiB)": 35.97,
1470
+ "memory/max_active (GiB)": 33.95,
1471
+ "memory/max_allocated (GiB)": 33.95,
1472
+ "ppl": 1.00155,
1473
+ "step": 1030,
1474
+ "tokens/total": 16875520,
1475
+ "tokens/train_per_sec_per_gpu": 8.48,
1476
+ "tokens/trainable": 5287369
1477
+ },
1478
+ {
1479
+ "epoch": 0.9983201343892488,
1480
+ "grad_norm": 0.09423828125,
1481
+ "learning_rate": 0.0001,
1482
+ "loss": 0.001351279579102993,
1483
+ "memory/device_reserved (GiB)": 35.97,
1484
+ "memory/max_active (GiB)": 33.95,
1485
+ "memory/max_allocated (GiB)": 33.95,
1486
+ "ppl": 1.00135,
1487
+ "step": 1040,
1488
+ "tokens/total": 17039360,
1489
+ "tokens/train_per_sec_per_gpu": 8.94,
1490
+ "tokens/trainable": 5338389
1491
+ },
1492
+ {
1493
+ "epoch": 1.0009599232061435,
1494
+ "eval_loss": 0.001463641761802137,
1495
+ "eval_ppl": 1.00146,
1496
+ "eval_runtime": 8.8734,
1497
+ "eval_samples_per_second": 22.539,
1498
+ "eval_steps_per_second": 22.539,
1499
+ "memory/device_reserved (GiB)": 35.97,
1500
+ "memory/max_active (GiB)": 33.95,
1501
+ "memory/max_allocated (GiB)": 33.95,
1502
+ "step": 1042
1503
+ },
1504
+ {
1505
+ "epoch": 1.0086393088552916,
1506
+ "grad_norm": 0.052490234375,
1507
+ "learning_rate": 0.0001,
1508
+ "loss": 0.0011446304619312287,
1509
+ "memory/device_reserved (GiB)": 35.97,
1510
+ "memory/max_active (GiB)": 33.96,
1511
+ "memory/max_allocated (GiB)": 33.96,
1512
+ "ppl": 1.00115,
1513
+ "step": 1050,
1514
+ "tokens/total": 17213440,
1515
+ "tokens/train_per_sec_per_gpu": 10.07,
1516
+ "tokens/trainable": 5392886
1517
+ },
1518
+ {
1519
+ "epoch": 1.0182385409167267,
1520
+ "grad_norm": 0.09375,
1521
+ "learning_rate": 0.0001,
1522
+ "loss": 0.001120314747095108,
1523
+ "memory/device_reserved (GiB)": 35.97,
1524
+ "memory/max_active (GiB)": 33.95,
1525
+ "memory/max_allocated (GiB)": 33.95,
1526
+ "ppl": 1.00112,
1527
+ "step": 1060,
1528
+ "tokens/total": 17377280,
1529
+ "tokens/train_per_sec_per_gpu": 8.48,
1530
+ "tokens/trainable": 5444495
1531
+ },
1532
+ {
1533
+ "epoch": 1.0278377729781618,
1534
+ "grad_norm": 0.0712890625,
1535
+ "learning_rate": 0.0001,
1536
+ "loss": 0.0010949315503239632,
1537
+ "memory/device_reserved (GiB)": 35.97,
1538
+ "memory/max_active (GiB)": 33.95,
1539
+ "memory/max_allocated (GiB)": 33.95,
1540
+ "ppl": 1.0011,
1541
+ "step": 1070,
1542
+ "tokens/total": 17541120,
1543
+ "tokens/train_per_sec_per_gpu": 8.47,
1544
+ "tokens/trainable": 5496208
1545
+ },
1546
+ {
1547
+ "epoch": 1.037437005039597,
1548
+ "grad_norm": 0.0184326171875,
1549
+ "learning_rate": 0.0001,
1550
+ "loss": 0.000931826326996088,
1551
+ "memory/device_reserved (GiB)": 35.97,
1552
+ "memory/max_active (GiB)": 33.95,
1553
+ "memory/max_allocated (GiB)": 33.95,
1554
+ "ppl": 1.00093,
1555
+ "step": 1080,
1556
+ "tokens/total": 17704960,
1557
+ "tokens/train_per_sec_per_gpu": 7.56,
1558
+ "tokens/trainable": 5547530
1559
+ },
1560
+ {
1561
+ "epoch": 1.047036237101032,
1562
+ "grad_norm": 0.09912109375,
1563
+ "learning_rate": 0.0001,
1564
+ "loss": 0.0007877454161643981,
1565
+ "memory/device_reserved (GiB)": 35.97,
1566
+ "memory/max_active (GiB)": 33.95,
1567
+ "memory/max_allocated (GiB)": 33.95,
1568
+ "ppl": 1.00079,
1569
+ "step": 1090,
1570
+ "tokens/total": 17868800,
1571
+ "tokens/train_per_sec_per_gpu": 8.39,
1572
+ "tokens/trainable": 5598017
1573
+ },
1574
+ {
1575
+ "epoch": 1.056635469162467,
1576
+ "grad_norm": 0.029052734375,
1577
+ "learning_rate": 0.0001,
1578
+ "loss": 0.000616989703848958,
1579
+ "memory/device_reserved (GiB)": 35.97,
1580
+ "memory/max_active (GiB)": 33.95,
1581
+ "memory/max_allocated (GiB)": 33.95,
1582
+ "ppl": 1.00062,
1583
+ "step": 1100,
1584
+ "tokens/total": 18032640,
1585
+ "tokens/train_per_sec_per_gpu": 8.31,
1586
+ "tokens/trainable": 5649223
1587
+ },
1588
+ {
1589
+ "epoch": 1.0662347012239022,
1590
+ "grad_norm": 0.054443359375,
1591
+ "learning_rate": 0.0001,
1592
+ "loss": 0.0008214156143367291,
1593
+ "memory/device_reserved (GiB)": 35.97,
1594
+ "memory/max_active (GiB)": 33.95,
1595
+ "memory/max_allocated (GiB)": 33.95,
1596
+ "ppl": 1.00082,
1597
+ "step": 1110,
1598
+ "tokens/total": 18196480,
1599
+ "tokens/train_per_sec_per_gpu": 9.55,
1600
+ "tokens/trainable": 5701017
1601
+ },
1602
+ {
1603
+ "epoch": 1.0758339332853373,
1604
+ "grad_norm": 0.0791015625,
1605
+ "learning_rate": 0.0001,
1606
+ "loss": 0.0007145676761865615,
1607
+ "memory/device_reserved (GiB)": 35.97,
1608
+ "memory/max_active (GiB)": 33.95,
1609
+ "memory/max_allocated (GiB)": 33.95,
1610
+ "ppl": 1.00071,
1611
+ "step": 1120,
1612
+ "tokens/total": 18360320,
1613
+ "tokens/train_per_sec_per_gpu": 7.91,
1614
+ "tokens/trainable": 5752591
1615
+ },
1616
+ {
1617
+ "epoch": 1.0854331653467724,
1618
+ "grad_norm": 0.09375,
1619
+ "learning_rate": 0.0001,
1620
+ "loss": 0.0010736193507909775,
1621
+ "memory/device_reserved (GiB)": 35.97,
1622
+ "memory/max_active (GiB)": 33.95,
1623
+ "memory/max_allocated (GiB)": 33.95,
1624
+ "ppl": 1.00107,
1625
+ "step": 1130,
1626
+ "tokens/total": 18524160,
1627
+ "tokens/train_per_sec_per_gpu": 9.44,
1628
+ "tokens/trainable": 5803980
1629
+ },
1630
+ {
1631
+ "epoch": 1.0950323974082075,
1632
+ "grad_norm": 0.0673828125,
1633
+ "learning_rate": 0.0001,
1634
+ "loss": 0.0010662767104804515,
1635
+ "memory/device_reserved (GiB)": 35.97,
1636
+ "memory/max_active (GiB)": 33.95,
1637
+ "memory/max_allocated (GiB)": 33.95,
1638
+ "ppl": 1.00107,
1639
+ "step": 1140,
1640
+ "tokens/total": 18688000,
1641
+ "tokens/train_per_sec_per_gpu": 8.0,
1642
+ "tokens/trainable": 5854955
1643
+ },
1644
+ {
1645
+ "epoch": 1.1046316294696423,
1646
+ "grad_norm": 0.0257568359375,
1647
+ "learning_rate": 0.0001,
1648
+ "loss": 0.0005197681020945311,
1649
+ "memory/device_reserved (GiB)": 35.97,
1650
+ "memory/max_active (GiB)": 33.95,
1651
+ "memory/max_allocated (GiB)": 33.95,
1652
+ "ppl": 1.00052,
1653
+ "step": 1150,
1654
+ "tokens/total": 18851840,
1655
+ "tokens/train_per_sec_per_gpu": 7.34,
1656
+ "tokens/trainable": 5905676
1657
+ },
1658
+ {
1659
+ "epoch": 1.1142308615310774,
1660
+ "grad_norm": 0.0172119140625,
1661
+ "learning_rate": 0.0001,
1662
+ "loss": 0.0009939110837876796,
1663
+ "memory/device_reserved (GiB)": 35.97,
1664
+ "memory/max_active (GiB)": 33.95,
1665
+ "memory/max_allocated (GiB)": 33.95,
1666
+ "ppl": 1.00099,
1667
+ "step": 1160,
1668
+ "tokens/total": 19015680,
1669
+ "tokens/train_per_sec_per_gpu": 7.12,
1670
+ "tokens/trainable": 5956999
1671
+ },
1672
+ {
1673
+ "epoch": 1.1238300935925125,
1674
+ "grad_norm": 0.0830078125,
1675
+ "learning_rate": 0.0001,
1676
+ "loss": 0.0008747033774852752,
1677
+ "memory/device_reserved (GiB)": 35.97,
1678
+ "memory/max_active (GiB)": 33.95,
1679
+ "memory/max_allocated (GiB)": 33.95,
1680
+ "ppl": 1.00088,
1681
+ "step": 1170,
1682
+ "tokens/total": 19179520,
1683
+ "tokens/train_per_sec_per_gpu": 7.56,
1684
+ "tokens/trainable": 6008446
1685
+ },
1686
+ {
1687
+ "epoch": 1.1334293256539476,
1688
+ "grad_norm": 0.1298828125,
1689
+ "learning_rate": 0.0001,
1690
+ "loss": 0.0009802436456084252,
1691
+ "memory/device_reserved (GiB)": 35.97,
1692
+ "memory/max_active (GiB)": 33.95,
1693
+ "memory/max_allocated (GiB)": 33.95,
1694
+ "ppl": 1.00098,
1695
+ "step": 1180,
1696
+ "tokens/total": 19343360,
1697
+ "tokens/train_per_sec_per_gpu": 8.36,
1698
+ "tokens/trainable": 6060444
1699
+ },
1700
+ {
1701
+ "epoch": 1.1430285577153827,
1702
+ "grad_norm": 0.126953125,
1703
+ "learning_rate": 0.0001,
1704
+ "loss": 0.0006262516602873802,
1705
+ "memory/device_reserved (GiB)": 35.97,
1706
+ "memory/max_active (GiB)": 33.95,
1707
+ "memory/max_allocated (GiB)": 33.95,
1708
+ "ppl": 1.00063,
1709
+ "step": 1190,
1710
+ "tokens/total": 19507200,
1711
+ "tokens/train_per_sec_per_gpu": 9.44,
1712
+ "tokens/trainable": 6112318
1713
+ },
1714
+ {
1715
+ "epoch": 1.1526277897768178,
1716
+ "grad_norm": 0.0546875,
1717
+ "learning_rate": 0.0001,
1718
+ "loss": 0.0008734981529414654,
1719
+ "memory/device_reserved (GiB)": 35.97,
1720
+ "memory/max_active (GiB)": 33.95,
1721
+ "memory/max_allocated (GiB)": 33.95,
1722
+ "ppl": 1.00087,
1723
+ "step": 1200,
1724
+ "tokens/total": 19671040,
1725
+ "tokens/train_per_sec_per_gpu": 8.47,
1726
+ "tokens/trainable": 6163992
1727
+ },
1728
+ {
1729
+ "epoch": 1.1622270218382529,
1730
+ "grad_norm": 0.2734375,
1731
+ "learning_rate": 0.0001,
1732
+ "loss": 0.0009970812126994133,
1733
+ "memory/device_reserved (GiB)": 35.97,
1734
+ "memory/max_active (GiB)": 33.95,
1735
+ "memory/max_allocated (GiB)": 33.95,
1736
+ "ppl": 1.001,
1737
+ "step": 1210,
1738
+ "tokens/total": 19834880,
1739
+ "tokens/train_per_sec_per_gpu": 7.83,
1740
+ "tokens/trainable": 6214313
1741
+ },
1742
+ {
1743
+ "epoch": 1.171826253899688,
1744
+ "grad_norm": 0.048828125,
1745
+ "learning_rate": 0.0001,
1746
+ "loss": 0.0009464750066399575,
1747
+ "memory/device_reserved (GiB)": 35.97,
1748
+ "memory/max_active (GiB)": 33.95,
1749
+ "memory/max_allocated (GiB)": 33.95,
1750
+ "ppl": 1.00095,
1751
+ "step": 1220,
1752
+ "tokens/total": 19998720,
1753
+ "tokens/train_per_sec_per_gpu": 9.08,
1754
+ "tokens/trainable": 6265730
1755
+ },
1756
+ {
1757
+ "epoch": 1.181425485961123,
1758
+ "grad_norm": 0.051513671875,
1759
+ "learning_rate": 0.0001,
1760
+ "loss": 0.001540043018758297,
1761
+ "memory/device_reserved (GiB)": 35.97,
1762
+ "memory/max_active (GiB)": 33.95,
1763
+ "memory/max_allocated (GiB)": 33.95,
1764
+ "ppl": 1.00154,
1765
+ "step": 1230,
1766
+ "tokens/total": 20162560,
1767
+ "tokens/train_per_sec_per_gpu": 9.53,
1768
+ "tokens/trainable": 6317338
1769
+ },
1770
+ {
1771
+ "epoch": 1.1910247180225582,
1772
+ "grad_norm": 0.08056640625,
1773
+ "learning_rate": 0.0001,
1774
+ "loss": 0.001301754917949438,
1775
+ "memory/device_reserved (GiB)": 35.97,
1776
+ "memory/max_active (GiB)": 33.95,
1777
+ "memory/max_allocated (GiB)": 33.95,
1778
+ "ppl": 1.0013,
1779
+ "step": 1240,
1780
+ "tokens/total": 20326400,
1781
+ "tokens/train_per_sec_per_gpu": 6.71,
1782
+ "tokens/trainable": 6368478
1783
+ },
1784
+ {
1785
+ "epoch": 1.2006239500839933,
1786
+ "grad_norm": 0.01708984375,
1787
+ "learning_rate": 0.0001,
1788
+ "loss": 0.0007991308346390724,
1789
+ "memory/device_reserved (GiB)": 35.97,
1790
+ "memory/max_active (GiB)": 33.95,
1791
+ "memory/max_allocated (GiB)": 33.95,
1792
+ "ppl": 1.0008,
1793
+ "step": 1250,
1794
+ "tokens/total": 20490240,
1795
+ "tokens/train_per_sec_per_gpu": 8.48,
1796
+ "tokens/trainable": 6420144
1797
+ },
1798
+ {
1799
+ "epoch": 1.2102231821454283,
1800
+ "grad_norm": 0.05908203125,
1801
+ "learning_rate": 0.0001,
1802
+ "loss": 0.0011655298061668874,
1803
+ "memory/device_reserved (GiB)": 35.97,
1804
+ "memory/max_active (GiB)": 33.95,
1805
+ "memory/max_allocated (GiB)": 33.95,
1806
+ "ppl": 1.00117,
1807
+ "step": 1260,
1808
+ "tokens/total": 20654080,
1809
+ "tokens/train_per_sec_per_gpu": 9.43,
1810
+ "tokens/trainable": 6471183
1811
+ },
1812
+ {
1813
+ "epoch": 1.2198224142068634,
1814
+ "grad_norm": 0.09130859375,
1815
+ "learning_rate": 0.0001,
1816
+ "loss": 0.0007856052368879318,
1817
+ "memory/device_reserved (GiB)": 35.97,
1818
+ "memory/max_active (GiB)": 33.95,
1819
+ "memory/max_allocated (GiB)": 33.95,
1820
+ "ppl": 1.00079,
1821
+ "step": 1270,
1822
+ "tokens/total": 20817920,
1823
+ "tokens/train_per_sec_per_gpu": 9.04,
1824
+ "tokens/trainable": 6522523
1825
+ },
1826
+ {
1827
+ "epoch": 1.2294216462682985,
1828
+ "grad_norm": 0.037109375,
1829
+ "learning_rate": 0.0001,
1830
+ "loss": 0.0009363952092826366,
1831
+ "memory/device_reserved (GiB)": 35.97,
1832
+ "memory/max_active (GiB)": 33.95,
1833
+ "memory/max_allocated (GiB)": 33.95,
1834
+ "ppl": 1.00094,
1835
+ "step": 1280,
1836
+ "tokens/total": 20981760,
1837
+ "tokens/train_per_sec_per_gpu": 8.04,
1838
+ "tokens/trainable": 6573574
1839
+ },
1840
+ {
1841
+ "epoch": 1.2390208783297336,
1842
+ "grad_norm": 0.04052734375,
1843
+ "learning_rate": 0.0001,
1844
+ "loss": 0.0008232606574892998,
1845
+ "memory/device_reserved (GiB)": 35.97,
1846
+ "memory/max_active (GiB)": 33.95,
1847
+ "memory/max_allocated (GiB)": 33.95,
1848
+ "ppl": 1.00082,
1849
+ "step": 1290,
1850
+ "tokens/total": 21145600,
1851
+ "tokens/train_per_sec_per_gpu": 8.19,
1852
+ "tokens/trainable": 6624754
1853
+ },
1854
+ {
1855
+ "epoch": 1.2486201103911687,
1856
+ "grad_norm": 0.045166015625,
1857
+ "learning_rate": 0.0001,
1858
+ "loss": 0.0007726194337010384,
1859
+ "memory/device_reserved (GiB)": 35.97,
1860
+ "memory/max_active (GiB)": 33.95,
1861
+ "memory/max_allocated (GiB)": 33.95,
1862
+ "ppl": 1.00077,
1863
+ "step": 1300,
1864
+ "tokens/total": 21309440,
1865
+ "tokens/train_per_sec_per_gpu": 8.56,
1866
+ "tokens/trainable": 6676467
1867
+ },
1868
+ {
1869
+ "epoch": 1.2582193424526038,
1870
+ "grad_norm": 0.0289306640625,
1871
+ "learning_rate": 0.0001,
1872
+ "loss": 0.0007264631800353527,
1873
+ "memory/device_reserved (GiB)": 35.97,
1874
+ "memory/max_active (GiB)": 33.95,
1875
+ "memory/max_allocated (GiB)": 33.95,
1876
+ "ppl": 1.00073,
1877
+ "step": 1310,
1878
+ "tokens/total": 21473280,
1879
+ "tokens/train_per_sec_per_gpu": 6.8,
1880
+ "tokens/trainable": 6727717
1881
+ },
1882
+ {
1883
+ "epoch": 1.267818574514039,
1884
+ "grad_norm": 0.0810546875,
1885
+ "learning_rate": 0.0001,
1886
+ "loss": 0.0010542750358581543,
1887
+ "memory/device_reserved (GiB)": 35.97,
1888
+ "memory/max_active (GiB)": 33.95,
1889
+ "memory/max_allocated (GiB)": 33.95,
1890
+ "ppl": 1.00105,
1891
+ "step": 1320,
1892
+ "tokens/total": 21637120,
1893
+ "tokens/train_per_sec_per_gpu": 8.18,
1894
+ "tokens/trainable": 6778400
1895
+ },
1896
+ {
1897
+ "epoch": 1.277417806575474,
1898
+ "grad_norm": 0.12255859375,
1899
+ "learning_rate": 0.0001,
1900
+ "loss": 0.0007948096841573715,
1901
+ "memory/device_reserved (GiB)": 35.97,
1902
+ "memory/max_active (GiB)": 33.95,
1903
+ "memory/max_allocated (GiB)": 33.95,
1904
+ "ppl": 1.0008,
1905
+ "step": 1330,
1906
+ "tokens/total": 21800960,
1907
+ "tokens/train_per_sec_per_gpu": 8.11,
1908
+ "tokens/trainable": 6829680
1909
+ },
1910
+ {
1911
+ "epoch": 1.287017038636909,
1912
+ "grad_norm": 0.05224609375,
1913
+ "learning_rate": 0.0001,
1914
+ "loss": 0.0010158532299101354,
1915
+ "memory/device_reserved (GiB)": 35.97,
1916
+ "memory/max_active (GiB)": 33.95,
1917
+ "memory/max_allocated (GiB)": 33.95,
1918
+ "ppl": 1.00102,
1919
+ "step": 1340,
1920
+ "tokens/total": 21964800,
1921
+ "tokens/train_per_sec_per_gpu": 8.58,
1922
+ "tokens/trainable": 6880865
1923
+ },
1924
+ {
1925
+ "epoch": 1.2966162706983442,
1926
+ "grad_norm": 0.07470703125,
1927
+ "learning_rate": 0.0001,
1928
+ "loss": 0.0007738139480352402,
1929
+ "memory/device_reserved (GiB)": 35.97,
1930
+ "memory/max_active (GiB)": 33.95,
1931
+ "memory/max_allocated (GiB)": 33.95,
1932
+ "ppl": 1.00077,
1933
+ "step": 1350,
1934
+ "tokens/total": 22128640,
1935
+ "tokens/train_per_sec_per_gpu": 7.45,
1936
+ "tokens/trainable": 6932720
1937
+ },
1938
+ {
1939
+ "epoch": 1.3062155027597793,
1940
+ "grad_norm": 0.01361083984375,
1941
+ "learning_rate": 0.0001,
1942
+ "loss": 0.00043031726963818075,
1943
+ "memory/device_reserved (GiB)": 35.97,
1944
+ "memory/max_active (GiB)": 33.95,
1945
+ "memory/max_allocated (GiB)": 33.95,
1946
+ "ppl": 1.00043,
1947
+ "step": 1360,
1948
+ "tokens/total": 22292480,
1949
+ "tokens/train_per_sec_per_gpu": 7.66,
1950
+ "tokens/trainable": 6983558
1951
+ },
1952
+ {
1953
+ "epoch": 1.3158147348212144,
1954
+ "grad_norm": 0.06689453125,
1955
+ "learning_rate": 0.0001,
1956
+ "loss": 0.0005287491250783205,
1957
+ "memory/device_reserved (GiB)": 35.97,
1958
+ "memory/max_active (GiB)": 33.95,
1959
+ "memory/max_allocated (GiB)": 33.95,
1960
+ "ppl": 1.00053,
1961
+ "step": 1370,
1962
+ "tokens/total": 22456320,
1963
+ "tokens/train_per_sec_per_gpu": 8.04,
1964
+ "tokens/trainable": 7035633
1965
+ },
1966
+ {
1967
+ "epoch": 1.3254139668826495,
1968
+ "grad_norm": 0.03173828125,
1969
+ "learning_rate": 0.0001,
1970
+ "loss": 0.0010193496011197567,
1971
+ "memory/device_reserved (GiB)": 35.97,
1972
+ "memory/max_active (GiB)": 33.95,
1973
+ "memory/max_allocated (GiB)": 33.95,
1974
+ "ppl": 1.00102,
1975
+ "step": 1380,
1976
+ "tokens/total": 22620160,
1977
+ "tokens/train_per_sec_per_gpu": 8.95,
1978
+ "tokens/trainable": 7087613
1979
+ },
1980
+ {
1981
+ "epoch": 1.3350131989440845,
1982
+ "grad_norm": 0.1796875,
1983
+ "learning_rate": 0.0001,
1984
+ "loss": 0.0008692140690982342,
1985
+ "memory/device_reserved (GiB)": 35.97,
1986
+ "memory/max_active (GiB)": 33.95,
1987
+ "memory/max_allocated (GiB)": 33.95,
1988
+ "ppl": 1.00087,
1989
+ "step": 1390,
1990
+ "tokens/total": 22784000,
1991
+ "tokens/train_per_sec_per_gpu": 9.06,
1992
+ "tokens/trainable": 7138863
1993
+ },
1994
+ {
1995
+ "epoch": 1.3446124310055196,
1996
+ "grad_norm": 0.1416015625,
1997
+ "learning_rate": 0.0001,
1998
+ "loss": 0.0008631485514342784,
1999
+ "memory/device_reserved (GiB)": 35.97,
2000
+ "memory/max_active (GiB)": 33.95,
2001
+ "memory/max_allocated (GiB)": 33.95,
2002
+ "ppl": 1.00086,
2003
+ "step": 1400,
2004
+ "tokens/total": 22947840,
2005
+ "tokens/train_per_sec_per_gpu": 8.53,
2006
+ "tokens/trainable": 7190354
2007
+ },
2008
+ {
2009
+ "epoch": 1.3542116630669545,
2010
+ "grad_norm": 0.028564453125,
2011
+ "learning_rate": 0.0001,
2012
+ "loss": 0.0010508927516639233,
2013
+ "memory/device_reserved (GiB)": 35.97,
2014
+ "memory/max_active (GiB)": 33.95,
2015
+ "memory/max_allocated (GiB)": 33.95,
2016
+ "ppl": 1.00105,
2017
+ "step": 1410,
2018
+ "tokens/total": 23111680,
2019
+ "tokens/train_per_sec_per_gpu": 8.97,
2020
+ "tokens/trainable": 7241829
2021
+ },
2022
+ {
2023
+ "epoch": 1.3638108951283896,
2024
+ "grad_norm": 0.068359375,
2025
+ "learning_rate": 0.0001,
2026
+ "loss": 0.0008184337988495827,
2027
+ "memory/device_reserved (GiB)": 35.97,
2028
+ "memory/max_active (GiB)": 33.95,
2029
+ "memory/max_allocated (GiB)": 33.95,
2030
+ "ppl": 1.00082,
2031
+ "step": 1420,
2032
+ "tokens/total": 23275520,
2033
+ "tokens/train_per_sec_per_gpu": 7.94,
2034
+ "tokens/trainable": 7292651
2035
+ },
2036
+ {
2037
+ "epoch": 1.3734101271898247,
2038
+ "grad_norm": 0.08837890625,
2039
+ "learning_rate": 0.0001,
2040
+ "loss": 0.0009149087592959404,
2041
+ "memory/device_reserved (GiB)": 35.97,
2042
+ "memory/max_active (GiB)": 33.95,
2043
+ "memory/max_allocated (GiB)": 33.95,
2044
+ "ppl": 1.00092,
2045
+ "step": 1430,
2046
+ "tokens/total": 23439360,
2047
+ "tokens/train_per_sec_per_gpu": 8.76,
2048
+ "tokens/trainable": 7344149
2049
+ },
2050
+ {
2051
+ "epoch": 1.3830093592512598,
2052
+ "grad_norm": 0.0322265625,
2053
+ "learning_rate": 0.0001,
2054
+ "loss": 0.000701209157705307,
2055
+ "memory/device_reserved (GiB)": 35.97,
2056
+ "memory/max_active (GiB)": 33.95,
2057
+ "memory/max_allocated (GiB)": 33.95,
2058
+ "ppl": 1.0007,
2059
+ "step": 1440,
2060
+ "tokens/total": 23603200,
2061
+ "tokens/train_per_sec_per_gpu": 7.78,
2062
+ "tokens/trainable": 7394829
2063
+ },
2064
+ {
2065
+ "epoch": 1.3926085913126949,
2066
+ "grad_norm": 0.181640625,
2067
+ "learning_rate": 0.0001,
2068
+ "loss": 0.0005787152796983719,
2069
+ "memory/device_reserved (GiB)": 35.97,
2070
+ "memory/max_active (GiB)": 33.95,
2071
+ "memory/max_allocated (GiB)": 33.95,
2072
+ "ppl": 1.00058,
2073
+ "step": 1450,
2074
+ "tokens/total": 23767040,
2075
+ "tokens/train_per_sec_per_gpu": 9.0,
2076
+ "tokens/trainable": 7446945
2077
+ },
2078
+ {
2079
+ "epoch": 1.40220782337413,
2080
+ "grad_norm": 0.0478515625,
2081
+ "learning_rate": 0.0001,
2082
+ "loss": 0.0005680257920175791,
2083
+ "memory/device_reserved (GiB)": 35.97,
2084
+ "memory/max_active (GiB)": 33.95,
2085
+ "memory/max_allocated (GiB)": 33.95,
2086
+ "ppl": 1.00057,
2087
+ "step": 1460,
2088
+ "tokens/total": 23930880,
2089
+ "tokens/train_per_sec_per_gpu": 7.56,
2090
+ "tokens/trainable": 7498437
2091
+ },
2092
+ {
2093
+ "epoch": 1.411807055435565,
2094
+ "grad_norm": 0.06494140625,
2095
+ "learning_rate": 0.0001,
2096
+ "loss": 0.001165725290775299,
2097
+ "memory/device_reserved (GiB)": 35.97,
2098
+ "memory/max_active (GiB)": 33.95,
2099
+ "memory/max_allocated (GiB)": 33.95,
2100
+ "ppl": 1.00117,
2101
+ "step": 1470,
2102
+ "tokens/total": 24094720,
2103
+ "tokens/train_per_sec_per_gpu": 6.74,
2104
+ "tokens/trainable": 7549736
2105
+ },
2106
+ {
2107
+ "epoch": 1.4214062874970002,
2108
+ "grad_norm": 0.039794921875,
2109
+ "learning_rate": 0.0001,
2110
+ "loss": 0.0007651465013623238,
2111
+ "memory/device_reserved (GiB)": 35.97,
2112
+ "memory/max_active (GiB)": 33.95,
2113
+ "memory/max_allocated (GiB)": 33.95,
2114
+ "ppl": 1.00077,
2115
+ "step": 1480,
2116
+ "tokens/total": 24258560,
2117
+ "tokens/train_per_sec_per_gpu": 9.69,
2118
+ "tokens/trainable": 7601091
2119
+ },
2120
+ {
2121
+ "epoch": 1.4310055195584352,
2122
+ "grad_norm": 0.0927734375,
2123
+ "learning_rate": 0.0001,
2124
+ "loss": 0.000851003173738718,
2125
+ "memory/device_reserved (GiB)": 35.97,
2126
+ "memory/max_active (GiB)": 33.95,
2127
+ "memory/max_allocated (GiB)": 33.95,
2128
+ "ppl": 1.00085,
2129
+ "step": 1490,
2130
+ "tokens/total": 24422400,
2131
+ "tokens/train_per_sec_per_gpu": 9.01,
2132
+ "tokens/trainable": 7652714
2133
+ },
2134
+ {
2135
+ "epoch": 1.4406047516198703,
2136
+ "grad_norm": 0.08056640625,
2137
+ "learning_rate": 0.0001,
2138
+ "loss": 0.0009756641462445259,
2139
+ "memory/device_reserved (GiB)": 35.97,
2140
+ "memory/max_active (GiB)": 33.95,
2141
+ "memory/max_allocated (GiB)": 33.95,
2142
+ "ppl": 1.00098,
2143
+ "step": 1500,
2144
+ "tokens/total": 24586240,
2145
+ "tokens/train_per_sec_per_gpu": 8.81,
2146
+ "tokens/trainable": 7703464
2147
+ },
2148
+ {
2149
+ "epoch": 1.4502039836813054,
2150
+ "grad_norm": 0.263671875,
2151
+ "learning_rate": 0.0001,
2152
+ "loss": 0.001250309031456709,
2153
+ "memory/device_reserved (GiB)": 35.97,
2154
+ "memory/max_active (GiB)": 33.95,
2155
+ "memory/max_allocated (GiB)": 33.95,
2156
+ "ppl": 1.00125,
2157
+ "step": 1510,
2158
+ "tokens/total": 24750080,
2159
+ "tokens/train_per_sec_per_gpu": 8.21,
2160
+ "tokens/trainable": 7754782
2161
+ },
2162
+ {
2163
+ "epoch": 1.4598032157427405,
2164
+ "grad_norm": 0.09326171875,
2165
+ "learning_rate": 0.0001,
2166
+ "loss": 0.0014243194833397864,
2167
+ "memory/device_reserved (GiB)": 35.97,
2168
+ "memory/max_active (GiB)": 33.95,
2169
+ "memory/max_allocated (GiB)": 33.95,
2170
+ "ppl": 1.00143,
2171
+ "step": 1520,
2172
+ "tokens/total": 24913920,
2173
+ "tokens/train_per_sec_per_gpu": 7.23,
2174
+ "tokens/trainable": 7806224
2175
+ },
2176
+ {
2177
+ "epoch": 1.4694024478041756,
2178
+ "grad_norm": 0.049072265625,
2179
+ "learning_rate": 0.0001,
2180
+ "loss": 0.0011884530074894428,
2181
+ "memory/device_reserved (GiB)": 35.97,
2182
+ "memory/max_active (GiB)": 33.95,
2183
+ "memory/max_allocated (GiB)": 33.95,
2184
+ "ppl": 1.00119,
2185
+ "step": 1530,
2186
+ "tokens/total": 25077760,
2187
+ "tokens/train_per_sec_per_gpu": 8.68,
2188
+ "tokens/trainable": 7857372
2189
+ },
2190
+ {
2191
+ "epoch": 1.4790016798656107,
2192
+ "grad_norm": 0.072265625,
2193
+ "learning_rate": 0.0001,
2194
+ "loss": 0.0008188777603209019,
2195
+ "memory/device_reserved (GiB)": 35.97,
2196
+ "memory/max_active (GiB)": 33.95,
2197
+ "memory/max_allocated (GiB)": 33.95,
2198
+ "ppl": 1.00082,
2199
+ "step": 1540,
2200
+ "tokens/total": 25241600,
2201
+ "tokens/train_per_sec_per_gpu": 9.24,
2202
+ "tokens/trainable": 7909316
2203
+ },
2204
+ {
2205
+ "epoch": 1.4886009119270458,
2206
+ "grad_norm": 0.138671875,
2207
+ "learning_rate": 0.0001,
2208
+ "loss": 0.0008213745430111885,
2209
+ "memory/device_reserved (GiB)": 35.97,
2210
+ "memory/max_active (GiB)": 33.95,
2211
+ "memory/max_allocated (GiB)": 33.95,
2212
+ "ppl": 1.00082,
2213
+ "step": 1550,
2214
+ "tokens/total": 25405440,
2215
+ "tokens/train_per_sec_per_gpu": 8.18,
2216
+ "tokens/trainable": 7960179
2217
+ },
2218
+ {
2219
+ "epoch": 1.498200143988481,
2220
+ "grad_norm": 0.12158203125,
2221
+ "learning_rate": 0.0001,
2222
+ "loss": 0.0010140080004930497,
2223
+ "memory/device_reserved (GiB)": 35.97,
2224
+ "memory/max_active (GiB)": 33.95,
2225
+ "memory/max_allocated (GiB)": 33.95,
2226
+ "ppl": 1.00101,
2227
+ "step": 1560,
2228
+ "tokens/total": 25569280,
2229
+ "tokens/train_per_sec_per_gpu": 9.44,
2230
+ "tokens/trainable": 8011395
2231
+ },
2232
+ {
2233
+ "epoch": 1.5010799136069113,
2234
+ "eval_loss": 0.0007253550575114787,
2235
+ "eval_ppl": 1.00073,
2236
+ "eval_runtime": 9.2519,
2237
+ "eval_samples_per_second": 21.617,
2238
+ "eval_steps_per_second": 21.617,
2239
+ "memory/device_reserved (GiB)": 35.97,
2240
+ "memory/max_active (GiB)": 33.95,
2241
+ "memory/max_allocated (GiB)": 33.95,
2242
+ "step": 1563
2243
+ },
2244
+ {
2245
+ "epoch": 1.507799376049916,
2246
+ "grad_norm": 0.1298828125,
2247
+ "learning_rate": 0.0001,
2248
+ "loss": 0.0008709205314517022,
2249
+ "memory/device_reserved (GiB)": 35.97,
2250
+ "memory/max_active (GiB)": 33.96,
2251
+ "memory/max_allocated (GiB)": 33.96,
2252
+ "ppl": 1.00087,
2253
+ "step": 1570,
2254
+ "tokens/total": 25733120,
2255
+ "tokens/train_per_sec_per_gpu": 9.84,
2256
+ "tokens/trainable": 8062743
2257
+ },
2258
+ {
2259
+ "epoch": 1.517398608111351,
2260
+ "grad_norm": 0.004058837890625,
2261
+ "learning_rate": 0.0001,
2262
+ "loss": 0.00034918386954814197,
2263
+ "memory/device_reserved (GiB)": 35.97,
2264
+ "memory/max_active (GiB)": 33.95,
2265
+ "memory/max_allocated (GiB)": 33.95,
2266
+ "ppl": 1.00035,
2267
+ "step": 1580,
2268
+ "tokens/total": 25896960,
2269
+ "tokens/train_per_sec_per_gpu": 9.47,
2270
+ "tokens/trainable": 8113834
2271
+ },
2272
+ {
2273
+ "epoch": 1.5269978401727862,
2274
+ "grad_norm": 0.0030517578125,
2275
+ "learning_rate": 0.0001,
2276
+ "loss": 0.0003432748606428504,
2277
+ "memory/device_reserved (GiB)": 35.97,
2278
+ "memory/max_active (GiB)": 33.95,
2279
+ "memory/max_allocated (GiB)": 33.95,
2280
+ "ppl": 1.00034,
2281
+ "step": 1590,
2282
+ "tokens/total": 26060800,
2283
+ "tokens/train_per_sec_per_gpu": 7.8,
2284
+ "tokens/trainable": 8165964
2285
+ },
2286
+ {
2287
+ "epoch": 1.5365970722342213,
2288
+ "grad_norm": 0.032958984375,
2289
+ "learning_rate": 0.0001,
2290
+ "loss": 0.0005480392836034298,
2291
+ "memory/device_reserved (GiB)": 35.97,
2292
+ "memory/max_active (GiB)": 33.95,
2293
+ "memory/max_allocated (GiB)": 33.95,
2294
+ "ppl": 1.00055,
2295
+ "step": 1600,
2296
+ "tokens/total": 26224640,
2297
+ "tokens/train_per_sec_per_gpu": 9.52,
2298
+ "tokens/trainable": 8217121
2299
+ },
2300
+ {
2301
+ "epoch": 1.5461963042956564,
2302
+ "grad_norm": 0.123046875,
2303
+ "learning_rate": 0.0001,
2304
+ "loss": 0.0005881413817405701,
2305
+ "memory/device_reserved (GiB)": 35.97,
2306
+ "memory/max_active (GiB)": 33.95,
2307
+ "memory/max_allocated (GiB)": 33.95,
2308
+ "ppl": 1.00059,
2309
+ "step": 1610,
2310
+ "tokens/total": 26388480,
2311
+ "tokens/train_per_sec_per_gpu": 9.15,
2312
+ "tokens/trainable": 8268301
2313
+ },
2314
+ {
2315
+ "epoch": 1.5557955363570914,
2316
+ "grad_norm": 0.0859375,
2317
+ "learning_rate": 0.0001,
2318
+ "loss": 0.0004818507470190525,
2319
+ "memory/device_reserved (GiB)": 35.97,
2320
+ "memory/max_active (GiB)": 33.95,
2321
+ "memory/max_allocated (GiB)": 33.95,
2322
+ "ppl": 1.00048,
2323
+ "step": 1620,
2324
+ "tokens/total": 26552320,
2325
+ "tokens/train_per_sec_per_gpu": 8.56,
2326
+ "tokens/trainable": 8320320
2327
+ },
2328
+ {
2329
+ "epoch": 1.5653947684185265,
2330
+ "grad_norm": 0.0927734375,
2331
+ "learning_rate": 0.0001,
2332
+ "loss": 0.0007268225774168969,
2333
+ "memory/device_reserved (GiB)": 35.97,
2334
+ "memory/max_active (GiB)": 33.95,
2335
+ "memory/max_allocated (GiB)": 33.95,
2336
+ "ppl": 1.00073,
2337
+ "step": 1630,
2338
+ "tokens/total": 26716160,
2339
+ "tokens/train_per_sec_per_gpu": 8.69,
2340
+ "tokens/trainable": 8372031
2341
+ },
2342
+ {
2343
+ "epoch": 1.5749940004799616,
2344
+ "grad_norm": 0.0147705078125,
2345
+ "learning_rate": 0.0001,
2346
+ "loss": 0.0006106278859078884,
2347
+ "memory/device_reserved (GiB)": 35.97,
2348
+ "memory/max_active (GiB)": 33.95,
2349
+ "memory/max_allocated (GiB)": 33.95,
2350
+ "ppl": 1.00061,
2351
+ "step": 1640,
2352
+ "tokens/total": 26880000,
2353
+ "tokens/train_per_sec_per_gpu": 9.11,
2354
+ "tokens/trainable": 8422723
2355
+ },
2356
+ {
2357
+ "epoch": 1.5845932325413967,
2358
+ "grad_norm": 0.08203125,
2359
+ "learning_rate": 0.0001,
2360
+ "loss": 0.0009039029479026795,
2361
+ "memory/device_reserved (GiB)": 35.97,
2362
+ "memory/max_active (GiB)": 33.95,
2363
+ "memory/max_allocated (GiB)": 33.95,
2364
+ "ppl": 1.0009,
2365
+ "step": 1650,
2366
+ "tokens/total": 27043840,
2367
+ "tokens/train_per_sec_per_gpu": 9.5,
2368
+ "tokens/trainable": 8474243
2369
+ },
2370
+ {
2371
+ "epoch": 1.5941924646028318,
2372
+ "grad_norm": 0.047607421875,
2373
+ "learning_rate": 0.0001,
2374
+ "loss": 0.0013276168145239353,
2375
+ "memory/device_reserved (GiB)": 35.97,
2376
+ "memory/max_active (GiB)": 33.95,
2377
+ "memory/max_allocated (GiB)": 33.95,
2378
+ "ppl": 1.00133,
2379
+ "step": 1660,
2380
+ "tokens/total": 27207680,
2381
+ "tokens/train_per_sec_per_gpu": 7.72,
2382
+ "tokens/trainable": 8524834
2383
+ },
2384
+ {
2385
+ "epoch": 1.603791696664267,
2386
+ "grad_norm": 0.1416015625,
2387
+ "learning_rate": 0.0001,
2388
+ "loss": 0.0019244521856307984,
2389
+ "memory/device_reserved (GiB)": 35.97,
2390
+ "memory/max_active (GiB)": 33.95,
2391
+ "memory/max_allocated (GiB)": 33.95,
2392
+ "ppl": 1.00193,
2393
+ "step": 1670,
2394
+ "tokens/total": 27371520,
2395
+ "tokens/train_per_sec_per_gpu": 8.16,
2396
+ "tokens/trainable": 8576115
2397
+ },
2398
+ {
2399
+ "epoch": 1.613390928725702,
2400
+ "grad_norm": 0.0859375,
2401
+ "learning_rate": 0.0001,
2402
+ "loss": 0.0014983797445893288,
2403
+ "memory/device_reserved (GiB)": 35.97,
2404
+ "memory/max_active (GiB)": 33.95,
2405
+ "memory/max_allocated (GiB)": 33.95,
2406
+ "ppl": 1.0015,
2407
+ "step": 1680,
2408
+ "tokens/total": 27535360,
2409
+ "tokens/train_per_sec_per_gpu": 8.39,
2410
+ "tokens/trainable": 8627475
2411
+ },
2412
+ {
2413
+ "epoch": 1.622990160787137,
2414
+ "grad_norm": 0.042236328125,
2415
+ "learning_rate": 0.0001,
2416
+ "loss": 0.0012701219879090787,
2417
+ "memory/device_reserved (GiB)": 35.97,
2418
+ "memory/max_active (GiB)": 33.95,
2419
+ "memory/max_allocated (GiB)": 33.95,
2420
+ "ppl": 1.00127,
2421
+ "step": 1690,
2422
+ "tokens/total": 27699200,
2423
+ "tokens/train_per_sec_per_gpu": 7.92,
2424
+ "tokens/trainable": 8678654
2425
+ },
2426
+ {
2427
+ "epoch": 1.6325893928485722,
2428
+ "grad_norm": 0.10986328125,
2429
+ "learning_rate": 0.0001,
2430
+ "loss": 0.0013377158902585506,
2431
+ "memory/device_reserved (GiB)": 35.97,
2432
+ "memory/max_active (GiB)": 33.95,
2433
+ "memory/max_allocated (GiB)": 33.95,
2434
+ "ppl": 1.00134,
2435
+ "step": 1700,
2436
+ "tokens/total": 27863040,
2437
+ "tokens/train_per_sec_per_gpu": 8.21,
2438
+ "tokens/trainable": 8729739
2439
+ },
2440
+ {
2441
+ "epoch": 1.6421886249100073,
2442
+ "grad_norm": 0.06298828125,
2443
+ "learning_rate": 0.0001,
2444
+ "loss": 0.001280638948082924,
2445
+ "memory/device_reserved (GiB)": 35.97,
2446
+ "memory/max_active (GiB)": 33.95,
2447
+ "memory/max_allocated (GiB)": 33.95,
2448
+ "ppl": 1.00128,
2449
+ "step": 1710,
2450
+ "tokens/total": 28026880,
2451
+ "tokens/train_per_sec_per_gpu": 8.23,
2452
+ "tokens/trainable": 8781378
2453
+ },
2454
+ {
2455
+ "epoch": 1.6517878569714424,
2456
+ "grad_norm": 0.034423828125,
2457
+ "learning_rate": 0.0001,
2458
+ "loss": 0.0007919369265437127,
2459
+ "memory/device_reserved (GiB)": 35.97,
2460
+ "memory/max_active (GiB)": 33.95,
2461
+ "memory/max_allocated (GiB)": 33.95,
2462
+ "ppl": 1.00079,
2463
+ "step": 1720,
2464
+ "tokens/total": 28190720,
2465
+ "tokens/train_per_sec_per_gpu": 8.31,
2466
+ "tokens/trainable": 8832775
2467
+ },
2468
+ {
2469
+ "epoch": 1.6613870890328775,
2470
+ "grad_norm": 0.05322265625,
2471
+ "learning_rate": 0.0001,
2472
+ "loss": 0.0013359258882701397,
2473
+ "memory/device_reserved (GiB)": 35.97,
2474
+ "memory/max_active (GiB)": 33.95,
2475
+ "memory/max_allocated (GiB)": 33.95,
2476
+ "ppl": 1.00134,
2477
+ "step": 1730,
2478
+ "tokens/total": 28354560,
2479
+ "tokens/train_per_sec_per_gpu": 7.45,
2480
+ "tokens/trainable": 8884196
2481
+ },
2482
+ {
2483
+ "epoch": 1.6709863210943126,
2484
+ "grad_norm": 0.0498046875,
2485
+ "learning_rate": 0.0001,
2486
+ "loss": 0.0010936973616480828,
2487
+ "memory/device_reserved (GiB)": 35.97,
2488
+ "memory/max_active (GiB)": 33.95,
2489
+ "memory/max_allocated (GiB)": 33.95,
2490
+ "ppl": 1.00109,
2491
+ "step": 1740,
2492
+ "tokens/total": 28518400,
2493
+ "tokens/train_per_sec_per_gpu": 8.84,
2494
+ "tokens/trainable": 8935603
2495
+ },
2496
+ {
2497
+ "epoch": 1.6805855531557476,
2498
+ "grad_norm": 0.0216064453125,
2499
+ "learning_rate": 0.0001,
2500
+ "loss": 0.0009528477676212788,
2501
+ "memory/device_reserved (GiB)": 35.97,
2502
+ "memory/max_active (GiB)": 33.95,
2503
+ "memory/max_allocated (GiB)": 33.95,
2504
+ "ppl": 1.00095,
2505
+ "step": 1750,
2506
+ "tokens/total": 28682240,
2507
+ "tokens/train_per_sec_per_gpu": 7.94,
2508
+ "tokens/trainable": 8987083
2509
+ },
2510
+ {
2511
+ "epoch": 1.6901847852171827,
2512
+ "grad_norm": 0.076171875,
2513
+ "learning_rate": 0.0001,
2514
+ "loss": 0.0006039697211235762,
2515
+ "memory/device_reserved (GiB)": 35.97,
2516
+ "memory/max_active (GiB)": 33.95,
2517
+ "memory/max_allocated (GiB)": 33.95,
2518
+ "ppl": 1.0006,
2519
+ "step": 1760,
2520
+ "tokens/total": 28846080,
2521
+ "tokens/train_per_sec_per_gpu": 8.42,
2522
+ "tokens/trainable": 9038209
2523
+ },
2524
+ {
2525
+ "epoch": 1.6997840172786178,
2526
+ "grad_norm": 0.034423828125,
2527
+ "learning_rate": 0.0001,
2528
+ "loss": 0.0006967922672629356,
2529
+ "memory/device_reserved (GiB)": 35.97,
2530
+ "memory/max_active (GiB)": 33.95,
2531
+ "memory/max_allocated (GiB)": 33.95,
2532
+ "ppl": 1.0007,
2533
+ "step": 1770,
2534
+ "tokens/total": 29009920,
2535
+ "tokens/train_per_sec_per_gpu": 8.48,
2536
+ "tokens/trainable": 9088795
2537
+ },
2538
+ {
2539
+ "epoch": 1.709383249340053,
2540
+ "grad_norm": 0.016357421875,
2541
+ "learning_rate": 0.0001,
2542
+ "loss": 0.0008365864865481854,
2543
+ "memory/device_reserved (GiB)": 35.97,
2544
+ "memory/max_active (GiB)": 33.95,
2545
+ "memory/max_allocated (GiB)": 33.95,
2546
+ "ppl": 1.00084,
2547
+ "step": 1780,
2548
+ "tokens/total": 29173760,
2549
+ "tokens/train_per_sec_per_gpu": 9.15,
2550
+ "tokens/trainable": 9140155
2551
+ },
2552
+ {
2553
+ "epoch": 1.718982481401488,
2554
+ "grad_norm": 0.0810546875,
2555
+ "learning_rate": 0.0001,
2556
+ "loss": 0.0005419908091425895,
2557
+ "memory/device_reserved (GiB)": 35.97,
2558
+ "memory/max_active (GiB)": 33.95,
2559
+ "memory/max_allocated (GiB)": 33.95,
2560
+ "ppl": 1.00054,
2561
+ "step": 1790,
2562
+ "tokens/total": 29337600,
2563
+ "tokens/train_per_sec_per_gpu": 9.11,
2564
+ "tokens/trainable": 9191457
2565
+ },
2566
+ {
2567
+ "epoch": 1.728581713462923,
2568
+ "grad_norm": 0.10546875,
2569
+ "learning_rate": 0.0001,
2570
+ "loss": 0.0008764880709350109,
2571
+ "memory/device_reserved (GiB)": 35.97,
2572
+ "memory/max_active (GiB)": 33.95,
2573
+ "memory/max_allocated (GiB)": 33.95,
2574
+ "ppl": 1.00088,
2575
+ "step": 1800,
2576
+ "tokens/total": 29501440,
2577
+ "tokens/train_per_sec_per_gpu": 9.69,
2578
+ "tokens/trainable": 9242381
2579
+ },
2580
+ {
2581
+ "epoch": 1.7381809455243582,
2582
+ "grad_norm": 0.365234375,
2583
+ "learning_rate": 0.0001,
2584
+ "loss": 0.0016637198626995088,
2585
+ "memory/device_reserved (GiB)": 35.97,
2586
+ "memory/max_active (GiB)": 33.95,
2587
+ "memory/max_allocated (GiB)": 33.95,
2588
+ "ppl": 1.00167,
2589
+ "step": 1810,
2590
+ "tokens/total": 29665280,
2591
+ "tokens/train_per_sec_per_gpu": 7.77,
2592
+ "tokens/trainable": 9294105
2593
+ },
2594
+ {
2595
+ "epoch": 1.7477801775857933,
2596
+ "grad_norm": 0.057861328125,
2597
+ "learning_rate": 0.0001,
2598
+ "loss": 0.0023251190781593324,
2599
+ "memory/device_reserved (GiB)": 35.97,
2600
+ "memory/max_active (GiB)": 33.95,
2601
+ "memory/max_allocated (GiB)": 33.95,
2602
+ "ppl": 1.00233,
2603
+ "step": 1820,
2604
+ "tokens/total": 29829120,
2605
+ "tokens/train_per_sec_per_gpu": 9.14,
2606
+ "tokens/trainable": 9345913
2607
+ },
2608
+ {
2609
+ "epoch": 1.7573794096472284,
2610
+ "grad_norm": 0.1953125,
2611
+ "learning_rate": 0.0001,
2612
+ "loss": 0.004122686386108398,
2613
+ "memory/device_reserved (GiB)": 35.97,
2614
+ "memory/max_active (GiB)": 33.95,
2615
+ "memory/max_allocated (GiB)": 33.95,
2616
+ "ppl": 1.00413,
2617
+ "step": 1830,
2618
+ "tokens/total": 29992960,
2619
+ "tokens/train_per_sec_per_gpu": 8.09,
2620
+ "tokens/trainable": 9397553
2621
+ },
2622
+ {
2623
+ "epoch": 1.7669786417086633,
2624
+ "grad_norm": 0.1875,
2625
+ "learning_rate": 0.0001,
2626
+ "loss": 0.00475989505648613,
2627
+ "memory/device_reserved (GiB)": 35.97,
2628
+ "memory/max_active (GiB)": 33.95,
2629
+ "memory/max_allocated (GiB)": 33.95,
2630
+ "ppl": 1.00477,
2631
+ "step": 1840,
2632
+ "tokens/total": 30156800,
2633
+ "tokens/train_per_sec_per_gpu": 9.12,
2634
+ "tokens/trainable": 9448264
2635
+ },
2636
+ {
2637
+ "epoch": 1.7765778737700983,
2638
+ "grad_norm": 0.12353515625,
2639
+ "learning_rate": 0.0001,
2640
+ "loss": 0.0030113702639937403,
2641
+ "memory/device_reserved (GiB)": 35.97,
2642
+ "memory/max_active (GiB)": 33.95,
2643
+ "memory/max_allocated (GiB)": 33.95,
2644
+ "ppl": 1.00302,
2645
+ "step": 1850,
2646
+ "tokens/total": 30320640,
2647
+ "tokens/train_per_sec_per_gpu": 8.87,
2648
+ "tokens/trainable": 9499714
2649
+ },
2650
+ {
2651
+ "epoch": 1.7861771058315334,
2652
+ "grad_norm": 0.08935546875,
2653
+ "learning_rate": 0.0001,
2654
+ "loss": 0.0021218497306108473,
2655
+ "memory/device_reserved (GiB)": 35.97,
2656
+ "memory/max_active (GiB)": 33.95,
2657
+ "memory/max_allocated (GiB)": 33.95,
2658
+ "ppl": 1.00212,
2659
+ "step": 1860,
2660
+ "tokens/total": 30484480,
2661
+ "tokens/train_per_sec_per_gpu": 8.7,
2662
+ "tokens/trainable": 9551484
2663
+ },
2664
+ {
2665
+ "epoch": 1.7957763378929685,
2666
+ "grad_norm": 0.0595703125,
2667
+ "learning_rate": 0.0001,
2668
+ "loss": 0.0019322805106639861,
2669
+ "memory/device_reserved (GiB)": 35.97,
2670
+ "memory/max_active (GiB)": 33.95,
2671
+ "memory/max_allocated (GiB)": 33.95,
2672
+ "ppl": 1.00193,
2673
+ "step": 1870,
2674
+ "tokens/total": 30648320,
2675
+ "tokens/train_per_sec_per_gpu": 8.71,
2676
+ "tokens/trainable": 9603376
2677
+ },
2678
+ {
2679
+ "epoch": 1.8053755699544036,
2680
+ "grad_norm": 0.1572265625,
2681
+ "learning_rate": 0.0001,
2682
+ "loss": 0.002130831032991409,
2683
+ "memory/device_reserved (GiB)": 35.97,
2684
+ "memory/max_active (GiB)": 33.95,
2685
+ "memory/max_allocated (GiB)": 33.95,
2686
+ "ppl": 1.00213,
2687
+ "step": 1880,
2688
+ "tokens/total": 30812160,
2689
+ "tokens/train_per_sec_per_gpu": 8.24,
2690
+ "tokens/trainable": 9654126
2691
+ },
2692
+ {
2693
+ "epoch": 1.8149748020158387,
2694
+ "grad_norm": 0.10546875,
2695
+ "learning_rate": 0.0001,
2696
+ "loss": 0.0013952101580798626,
2697
+ "memory/device_reserved (GiB)": 35.97,
2698
+ "memory/max_active (GiB)": 33.95,
2699
+ "memory/max_allocated (GiB)": 33.95,
2700
+ "ppl": 1.0014,
2701
+ "step": 1890,
2702
+ "tokens/total": 30976000,
2703
+ "tokens/train_per_sec_per_gpu": 9.71,
2704
+ "tokens/trainable": 9704260
2705
+ },
2706
+ {
2707
+ "epoch": 1.8245740340772738,
2708
+ "grad_norm": 0.06591796875,
2709
+ "learning_rate": 0.0001,
2710
+ "loss": 0.0013564865104854107,
2711
+ "memory/device_reserved (GiB)": 35.97,
2712
+ "memory/max_active (GiB)": 33.95,
2713
+ "memory/max_allocated (GiB)": 33.95,
2714
+ "ppl": 1.00136,
2715
+ "step": 1900,
2716
+ "tokens/total": 31139840,
2717
+ "tokens/train_per_sec_per_gpu": 8.05,
2718
+ "tokens/trainable": 9755570
2719
+ },
2720
+ {
2721
+ "epoch": 1.834173266138709,
2722
+ "grad_norm": 0.099609375,
2723
+ "learning_rate": 0.0001,
2724
+ "loss": 0.0014380639418959617,
2725
+ "memory/device_reserved (GiB)": 35.97,
2726
+ "memory/max_active (GiB)": 33.95,
2727
+ "memory/max_allocated (GiB)": 33.95,
2728
+ "ppl": 1.00144,
2729
+ "step": 1910,
2730
+ "tokens/total": 31303680,
2731
+ "tokens/train_per_sec_per_gpu": 8.37,
2732
+ "tokens/trainable": 9806775
2733
+ },
2734
+ {
2735
+ "epoch": 1.843772498200144,
2736
+ "grad_norm": 0.0908203125,
2737
+ "learning_rate": 0.0001,
2738
+ "loss": 0.0013548688031733036,
2739
+ "memory/device_reserved (GiB)": 35.97,
2740
+ "memory/max_active (GiB)": 33.95,
2741
+ "memory/max_allocated (GiB)": 33.95,
2742
+ "ppl": 1.00136,
2743
+ "step": 1920,
2744
+ "tokens/total": 31467520,
2745
+ "tokens/train_per_sec_per_gpu": 8.63,
2746
+ "tokens/trainable": 9858270
2747
+ },
2748
+ {
2749
+ "epoch": 1.853371730261579,
2750
+ "grad_norm": 0.02587890625,
2751
+ "learning_rate": 0.0001,
2752
+ "loss": 0.001307238917797804,
2753
+ "memory/device_reserved (GiB)": 35.97,
2754
+ "memory/max_active (GiB)": 33.95,
2755
+ "memory/max_allocated (GiB)": 33.95,
2756
+ "ppl": 1.00131,
2757
+ "step": 1930,
2758
+ "tokens/total": 31631360,
2759
+ "tokens/train_per_sec_per_gpu": 8.62,
2760
+ "tokens/trainable": 9909516
2761
+ },
2762
+ {
2763
+ "epoch": 1.8629709623230142,
2764
+ "grad_norm": 0.08447265625,
2765
+ "learning_rate": 0.0001,
2766
+ "loss": 0.001157990377396345,
2767
+ "memory/device_reserved (GiB)": 35.97,
2768
+ "memory/max_active (GiB)": 33.95,
2769
+ "memory/max_allocated (GiB)": 33.95,
2770
+ "ppl": 1.00116,
2771
+ "step": 1940,
2772
+ "tokens/total": 31795200,
2773
+ "tokens/train_per_sec_per_gpu": 8.34,
2774
+ "tokens/trainable": 9960863
2775
+ },
2776
+ {
2777
+ "epoch": 1.8725701943844493,
2778
+ "grad_norm": 0.0263671875,
2779
+ "learning_rate": 0.0001,
2780
+ "loss": 0.0011683990247547626,
2781
+ "memory/device_reserved (GiB)": 35.97,
2782
+ "memory/max_active (GiB)": 33.95,
2783
+ "memory/max_allocated (GiB)": 33.95,
2784
+ "ppl": 1.00117,
2785
+ "step": 1950,
2786
+ "tokens/total": 31959040,
2787
+ "tokens/train_per_sec_per_gpu": 7.79,
2788
+ "tokens/trainable": 10013368
2789
+ },
2790
+ {
2791
+ "epoch": 1.8821694264458844,
2792
+ "grad_norm": 0.05908203125,
2793
+ "learning_rate": 0.0001,
2794
+ "loss": 0.0007935081608593464,
2795
+ "memory/device_reserved (GiB)": 35.97,
2796
+ "memory/max_active (GiB)": 33.95,
2797
+ "memory/max_allocated (GiB)": 33.95,
2798
+ "ppl": 1.00079,
2799
+ "step": 1960,
2800
+ "tokens/total": 32122880,
2801
+ "tokens/train_per_sec_per_gpu": 9.08,
2802
+ "tokens/trainable": 10064774
2803
+ },
2804
+ {
2805
+ "epoch": 1.8917686585073195,
2806
+ "grad_norm": 0.055419921875,
2807
+ "learning_rate": 0.0001,
2808
+ "loss": 0.0008186689577996731,
2809
+ "memory/device_reserved (GiB)": 35.97,
2810
+ "memory/max_active (GiB)": 33.95,
2811
+ "memory/max_allocated (GiB)": 33.95,
2812
+ "ppl": 1.00082,
2813
+ "step": 1970,
2814
+ "tokens/total": 32286720,
2815
+ "tokens/train_per_sec_per_gpu": 8.17,
2816
+ "tokens/trainable": 10116172
2817
+ },
2818
+ {
2819
+ "epoch": 1.9013678905687545,
2820
+ "grad_norm": 0.091796875,
2821
+ "learning_rate": 0.0001,
2822
+ "loss": 0.0007270051632076502,
2823
+ "memory/device_reserved (GiB)": 35.97,
2824
+ "memory/max_active (GiB)": 33.95,
2825
+ "memory/max_allocated (GiB)": 33.95,
2826
+ "ppl": 1.00073,
2827
+ "step": 1980,
2828
+ "tokens/total": 32450560,
2829
+ "tokens/train_per_sec_per_gpu": 8.34,
2830
+ "tokens/trainable": 10167347
2831
+ },
2832
+ {
2833
+ "epoch": 1.9109671226301894,
2834
+ "grad_norm": 0.091796875,
2835
+ "learning_rate": 0.0001,
2836
+ "loss": 0.0011019655503332615,
2837
+ "memory/device_reserved (GiB)": 35.97,
2838
+ "memory/max_active (GiB)": 33.95,
2839
+ "memory/max_allocated (GiB)": 33.95,
2840
+ "ppl": 1.0011,
2841
+ "step": 1990,
2842
+ "tokens/total": 32614400,
2843
+ "tokens/train_per_sec_per_gpu": 9.52,
2844
+ "tokens/trainable": 10218140
2845
+ },
2846
+ {
2847
+ "epoch": 1.9205663546916245,
2848
+ "grad_norm": 0.1396484375,
2849
+ "learning_rate": 0.0001,
2850
+ "loss": 0.0009611468762159347,
2851
+ "memory/device_reserved (GiB)": 35.97,
2852
+ "memory/max_active (GiB)": 33.95,
2853
+ "memory/max_allocated (GiB)": 33.95,
2854
+ "ppl": 1.00096,
2855
+ "step": 2000,
2856
+ "tokens/total": 32778240,
2857
+ "tokens/train_per_sec_per_gpu": 6.9,
2858
+ "tokens/trainable": 10269179
2859
+ },
2860
+ {
2861
+ "epoch": 1.9301655867530596,
2862
+ "grad_norm": 0.0247802734375,
2863
+ "learning_rate": 0.0001,
2864
+ "loss": 0.000824358593672514,
2865
+ "memory/device_reserved (GiB)": 35.97,
2866
+ "memory/max_active (GiB)": 33.95,
2867
+ "memory/max_allocated (GiB)": 33.95,
2868
+ "ppl": 1.00082,
2869
+ "step": 2010,
2870
+ "tokens/total": 32942080,
2871
+ "tokens/train_per_sec_per_gpu": 8.34,
2872
+ "tokens/trainable": 10320155
2873
+ },
2874
+ {
2875
+ "epoch": 1.9397648188144947,
2876
+ "grad_norm": 0.06396484375,
2877
+ "learning_rate": 0.0001,
2878
+ "loss": 0.0006628005299717188,
2879
+ "memory/device_reserved (GiB)": 35.97,
2880
+ "memory/max_active (GiB)": 33.95,
2881
+ "memory/max_allocated (GiB)": 33.95,
2882
+ "ppl": 1.00066,
2883
+ "step": 2020,
2884
+ "tokens/total": 33105920,
2885
+ "tokens/train_per_sec_per_gpu": 6.81,
2886
+ "tokens/trainable": 10371059
2887
+ },
2888
+ {
2889
+ "epoch": 1.9493640508759298,
2890
+ "grad_norm": 0.0291748046875,
2891
+ "learning_rate": 0.0001,
2892
+ "loss": 0.0009558702819049359,
2893
+ "memory/device_reserved (GiB)": 35.97,
2894
+ "memory/max_active (GiB)": 33.95,
2895
+ "memory/max_allocated (GiB)": 33.95,
2896
+ "ppl": 1.00096,
2897
+ "step": 2030,
2898
+ "tokens/total": 33269760,
2899
+ "tokens/train_per_sec_per_gpu": 8.88,
2900
+ "tokens/trainable": 10422089
2901
+ },
2902
+ {
2903
+ "epoch": 1.9589632829373649,
2904
+ "grad_norm": 0.08349609375,
2905
+ "learning_rate": 0.0001,
2906
+ "loss": 0.0006137116346508264,
2907
+ "memory/device_reserved (GiB)": 35.97,
2908
+ "memory/max_active (GiB)": 33.95,
2909
+ "memory/max_allocated (GiB)": 33.95,
2910
+ "ppl": 1.00061,
2911
+ "step": 2040,
2912
+ "tokens/total": 33433600,
2913
+ "tokens/train_per_sec_per_gpu": 7.84,
2914
+ "tokens/trainable": 10473462
2915
+ },
2916
+ {
2917
+ "epoch": 1.9685625149988,
2918
+ "grad_norm": 0.291015625,
2919
+ "learning_rate": 0.0001,
2920
+ "loss": 0.0007995942607522011,
2921
+ "memory/device_reserved (GiB)": 35.97,
2922
+ "memory/max_active (GiB)": 33.95,
2923
+ "memory/max_allocated (GiB)": 33.95,
2924
+ "ppl": 1.0008,
2925
+ "step": 2050,
2926
+ "tokens/total": 33597440,
2927
+ "tokens/train_per_sec_per_gpu": 8.46,
2928
+ "tokens/trainable": 10524388
2929
+ },
2930
+ {
2931
+ "epoch": 1.978161747060235,
2932
+ "grad_norm": 0.01318359375,
2933
+ "learning_rate": 0.0001,
2934
+ "loss": 0.0012844327837228775,
2935
+ "memory/device_reserved (GiB)": 35.97,
2936
+ "memory/max_active (GiB)": 33.95,
2937
+ "memory/max_allocated (GiB)": 33.95,
2938
+ "ppl": 1.00129,
2939
+ "step": 2060,
2940
+ "tokens/total": 33761280,
2941
+ "tokens/train_per_sec_per_gpu": 8.64,
2942
+ "tokens/trainable": 10575622
2943
+ },
2944
+ {
2945
+ "epoch": 1.9877609791216702,
2946
+ "grad_norm": 0.0113525390625,
2947
+ "learning_rate": 0.0001,
2948
+ "loss": 0.0011016235686838627,
2949
+ "memory/device_reserved (GiB)": 35.97,
2950
+ "memory/max_active (GiB)": 33.95,
2951
+ "memory/max_allocated (GiB)": 33.95,
2952
+ "ppl": 1.0011,
2953
+ "step": 2070,
2954
+ "tokens/total": 33925120,
2955
+ "tokens/train_per_sec_per_gpu": 9.57,
2956
+ "tokens/trainable": 10627503
2957
+ },
2958
+ {
2959
+ "epoch": 1.9973602111831052,
2960
+ "grad_norm": 0.1630859375,
2961
+ "learning_rate": 0.0001,
2962
+ "loss": 0.0009904997423291206,
2963
+ "memory/device_reserved (GiB)": 35.97,
2964
+ "memory/max_active (GiB)": 33.95,
2965
+ "memory/max_allocated (GiB)": 33.95,
2966
+ "ppl": 1.00099,
2967
+ "step": 2080,
2968
+ "tokens/total": 34088960,
2969
+ "tokens/train_per_sec_per_gpu": 8.51,
2970
+ "tokens/trainable": 10679027
2971
+ }
2972
+ ],
2973
+ "logging_steps": 10,
2974
+ "max_steps": 3123,
2975
+ "num_input_tokens_seen": 0,
2976
+ "num_train_epochs": 3,
2977
+ "save_steps": 1041,
2978
+ "stateful_callbacks": {
2979
+ "TrainerControl": {
2980
+ "args": {
2981
+ "should_epoch_stop": false,
2982
+ "should_evaluate": false,
2983
+ "should_log": false,
2984
+ "should_save": true,
2985
+ "should_training_stop": false
2986
+ },
2987
+ "attributes": {}
2988
+ }
2989
+ },
2990
+ "total_flos": 7.438902357896724e+17,
2991
+ "train_batch_size": 1,
2992
+ "trial_name": null,
2993
+ "trial_params": null
2994
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-2082/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bc5dc0a6b631434a1e530ec14cbf9d04e0cb0394c28ae6df258badbdff9da4e
3
+ size 7121
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 262144,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 5000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": true,
67
+ "transformers_version": "5.0.0",
68
+ "use_cache": false,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.7,
9
+ "top_k": 20,
10
+ "top_p": 0.8,
11
+ "transformers_version": "5.0.0"
12
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20f299eec38f2ee6b3400fd956fdd92266f72da5225fa3b04e2fe1e66ccf72d5
3
+ size 8822894520
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1bc46ef1ffa4a3e07cac67e81070ecec954323920d27e8b2388f5f89d6909ec
3
+ size 16090225449
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20ea3a198ff666cb4ace1c684b598fe43fc7c3c276b83efc553a1b787e12a304
3
+ size 14645
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78c3c62dddcf61ce76eba74e4febde7485ae697ca0a51e1ac7b67acf61c1d077
3
+ size 1465
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/tokens_state. ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total": 51173376, "trainable": 16031558}
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/checkpoint-3123/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bc5dc0a6b631434a1e530ec14cbf9d04e0cb0394c28ae6df258badbdff9da4e
3
+ size 7121
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/debug.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_results.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2
+ math_operations,balanced_test_alpaca_results,500,3,0.60,500,100.00,497
checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/eval_summary.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "overall": {
3
+ "total": 500,
4
+ "correct": 3,
5
+ "accuracy": 0.6,
6
+ "format_found": 500,
7
+ "format_accuracy": 100.0
8
+ },
9
+ "per_operation": {
10
+ "a": {
11
+ "total": 25,
12
+ "correct": 0,
13
+ "accuracy": 0.0,
14
+ "format_found": 25
15
+ },
16
+ "b": {
17
+ "total": 25,
18
+ "correct": 0,
19
+ "accuracy": 0.0,
20
+ "format_found": 25
21
+ },
22
+ "c": {
23
+ "total": 25,
24
+ "correct": 0,
25
+ "accuracy": 0.0,
26
+ "format_found": 25
27
+ },
28
+ "d": {
29
+ "total": 25,
30
+ "correct": 0,
31
+ "accuracy": 0.0,
32
+ "format_found": 25
33
+ },
34
+ "e": {
35
+ "total": 25,
36
+ "correct": 0,
37
+ "accuracy": 0.0,
38
+ "format_found": 25
39
+ },
40
+ "f": {
41
+ "total": 25,
42
+ "correct": 0,
43
+ "accuracy": 0.0,
44
+ "format_found": 25
45
+ },
46
+ "g": {
47
+ "total": 25,
48
+ "correct": 1,
49
+ "accuracy": 4.0,
50
+ "format_found": 25
51
+ },
52
+ "h": {
53
+ "total": 25,
54
+ "correct": 0,
55
+ "accuracy": 0.0,
56
+ "format_found": 25
57
+ },
58
+ "i": {
59
+ "total": 25,
60
+ "correct": 1,
61
+ "accuracy": 4.0,
62
+ "format_found": 25
63
+ },
64
+ "j": {
65
+ "total": 25,
66
+ "correct": 0,
67
+ "accuracy": 0.0,
68
+ "format_found": 25
69
+ },
70
+ "k": {
71
+ "total": 25,
72
+ "correct": 0,
73
+ "accuracy": 0.0,
74
+ "format_found": 25
75
+ },
76
+ "l": {
77
+ "total": 25,
78
+ "correct": 0,
79
+ "accuracy": 0.0,
80
+ "format_found": 25
81
+ },
82
+ "m": {
83
+ "total": 25,
84
+ "correct": 0,
85
+ "accuracy": 0.0,
86
+ "format_found": 25
87
+ },
88
+ "n": {
89
+ "total": 25,
90
+ "correct": 0,
91
+ "accuracy": 0.0,
92
+ "format_found": 25
93
+ },
94
+ "o": {
95
+ "total": 25,
96
+ "correct": 0,
97
+ "accuracy": 0.0,
98
+ "format_found": 25
99
+ },
100
+ "p": {
101
+ "total": 25,
102
+ "correct": 0,
103
+ "accuracy": 0.0,
104
+ "format_found": 25
105
+ },
106
+ "q": {
107
+ "total": 25,
108
+ "correct": 0,
109
+ "accuracy": 0.0,
110
+ "format_found": 25
111
+ },
112
+ "r": {
113
+ "total": 25,
114
+ "correct": 0,
115
+ "accuracy": 0.0,
116
+ "format_found": 25
117
+ },
118
+ "s": {
119
+ "total": 25,
120
+ "correct": 0,
121
+ "accuracy": 0.0,
122
+ "format_found": 25
123
+ },
124
+ "t": {
125
+ "total": 25,
126
+ "correct": 1,
127
+ "accuracy": 4.0,
128
+ "format_found": 25
129
+ }
130
+ },
131
+ "n_errors": 497,
132
+ "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/primitive_atomic_full_sft_50k_lr1e4_t20260308/eval_results_easy_ops/balanced_test_alpaca_results.jsonl"
133
+ }