Jerry999 commited on
Commit
02dfe15
·
verified ·
1 Parent(s): 987f783

Upload checkpoints/math_operations/lora_sft_primitive_atomic_50k

Browse files
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_config.json +46 -0
  3. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_model.safetensors +3 -0
  4. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/chat_template.jinja +4 -0
  5. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/optimizer.pt +3 -0
  6. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/rng_state.pth +3 -0
  7. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/scheduler.pt +3 -0
  8. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer.json +3 -0
  9. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer_config.json +29 -0
  10. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokens_state. +1 -0
  11. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/trainer_state.json +1500 -0
  12. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/training_args.bin +3 -0
  13. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_config.json +46 -0
  14. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_model.safetensors +3 -0
  15. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/chat_template.jinja +4 -0
  16. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/optimizer.pt +3 -0
  17. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/rng_state.pth +3 -0
  18. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/scheduler.pt +3 -0
  19. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer.json +3 -0
  20. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer_config.json +29 -0
  21. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokens_state. +1 -0
  22. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/trainer_state.json +2966 -0
  23. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/training_args.bin +3 -0
  24. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_config.json +46 -0
  25. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_model.safetensors +3 -0
  26. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/chat_template.jinja +4 -0
  27. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/optimizer.pt +3 -0
  28. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/rng_state.pth +3 -0
  29. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/scheduler.pt +3 -0
  30. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer.json +3 -0
  31. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer_config.json +29 -0
  32. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokens_state. +1 -0
  33. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json +0 -0
  34. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/training_args.bin +3 -0
  35. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_config.json +46 -0
  36. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_model.safetensors +3 -0
  37. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/chat_template.jinja +4 -0
  38. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/optimizer.pt +3 -0
  39. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/rng_state.pth +3 -0
  40. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/scheduler.pt +3 -0
  41. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer.json +3 -0
  42. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer_config.json +29 -0
  43. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokens_state. +1 -0
  44. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json +0 -0
  45. checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/training_args.bin +3 -0
  46. checkpoints/math_operations/lora_sft_primitive_atomic_50k/debug.log +0 -0
  47. checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl +0 -0
  48. checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_results.jsonl +0 -0
  49. checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_results.csv +2 -0
  50. checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_summary.json +133 -0
.gitattributes CHANGED
@@ -63,3 +63,7 @@ checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1248/tokenizer.j
63
  checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1274/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
  checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
  checkpoints/math_operations/base_model_eval/eval_results_easy_ops/balanced_test_alpaca_results.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
63
  checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1274/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
  checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
  checkpoints/math_operations/base_model_eval/eval_results_easy_ops/balanced_test_alpaca_results.jsonl filter=lfs diff=lfs merge=lfs -text
66
+ checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer.json filter=lfs diff=lfs merge=lfs -text
67
+ checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer.json filter=lfs diff=lfs merge=lfs -text
68
+ checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer.json filter=lfs diff=lfs merge=lfs -text
69
+ checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "q_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:379e91d5a96f500546d4939abf324418d8037973997e317959dee26b1120871d
3
+ size 264308896
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c868a01dbfce25472c802e9bb70d445af05f005b7200ed0296eeb2afef96ff3
3
+ size 528915403
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ac2f7da54075bc45ef2073674c010a395ea84101521997fd9e15096792e2601
3
+ size 14645
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2edfde6394729c32e1a44395988017d981d5c693031d697a92775ac9a22761ec
3
+ size 1465
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/tokens_state. ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total": 16904192, "trainable": 5351770}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/trainer_state.json ADDED
@@ -0,0 +1,1500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.000969696969697,
6
+ "eval_steps": 516,
7
+ "global_step": 1031,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0,
14
+ "eval_loss": 0.8898435831069946,
15
+ "eval_ppl": 2.43475,
16
+ "eval_runtime": 12.6383,
17
+ "eval_samples_per_second": 15.825,
18
+ "eval_steps_per_second": 7.912,
19
+ "memory/device_reserved (GiB)": 13.84,
20
+ "memory/max_active (GiB)": 13.69,
21
+ "memory/max_allocated (GiB)": 13.69,
22
+ "step": 0
23
+ },
24
+ {
25
+ "epoch": 0.009696969696969697,
26
+ "grad_norm": 2.995619058609009,
27
+ "learning_rate": 3.4951456310679615e-06,
28
+ "loss": 0.8680612564086914,
29
+ "memory/device_reserved (GiB)": 18.85,
30
+ "memory/max_active (GiB)": 16.23,
31
+ "memory/max_allocated (GiB)": 16.23,
32
+ "ppl": 2.38229,
33
+ "step": 10,
34
+ "tokens/total": 163840,
35
+ "tokens/train_per_sec_per_gpu": 14.27,
36
+ "tokens/trainable": 51990
37
+ },
38
+ {
39
+ "epoch": 0.019393939393939394,
40
+ "grad_norm": 2.1244935989379883,
41
+ "learning_rate": 7.378640776699029e-06,
42
+ "loss": 0.7699687004089355,
43
+ "memory/device_reserved (GiB)": 18.85,
44
+ "memory/max_active (GiB)": 16.23,
45
+ "memory/max_allocated (GiB)": 16.23,
46
+ "ppl": 2.1597,
47
+ "step": 20,
48
+ "tokens/total": 327680,
49
+ "tokens/train_per_sec_per_gpu": 16.06,
50
+ "tokens/trainable": 104391
51
+ },
52
+ {
53
+ "epoch": 0.02909090909090909,
54
+ "grad_norm": 0.9706138372421265,
55
+ "learning_rate": 1.1262135922330098e-05,
56
+ "loss": 0.5319457054138184,
57
+ "memory/device_reserved (GiB)": 18.85,
58
+ "memory/max_active (GiB)": 16.23,
59
+ "memory/max_allocated (GiB)": 16.23,
60
+ "ppl": 1.70224,
61
+ "step": 30,
62
+ "tokens/total": 491520,
63
+ "tokens/train_per_sec_per_gpu": 16.48,
64
+ "tokens/trainable": 156787
65
+ },
66
+ {
67
+ "epoch": 0.03878787878787879,
68
+ "grad_norm": 0.7689842581748962,
69
+ "learning_rate": 1.5145631067961166e-05,
70
+ "loss": 0.30234951972961427,
71
+ "memory/device_reserved (GiB)": 18.85,
72
+ "memory/max_active (GiB)": 16.23,
73
+ "memory/max_allocated (GiB)": 16.23,
74
+ "ppl": 1.35303,
75
+ "step": 40,
76
+ "tokens/total": 655360,
77
+ "tokens/train_per_sec_per_gpu": 14.84,
78
+ "tokens/trainable": 208924
79
+ },
80
+ {
81
+ "epoch": 0.048484848484848485,
82
+ "grad_norm": 0.45850396156311035,
83
+ "learning_rate": 1.9029126213592234e-05,
84
+ "loss": 0.1519382953643799,
85
+ "memory/device_reserved (GiB)": 18.85,
86
+ "memory/max_active (GiB)": 16.23,
87
+ "memory/max_allocated (GiB)": 16.23,
88
+ "ppl": 1.16409,
89
+ "step": 50,
90
+ "tokens/total": 819200,
91
+ "tokens/train_per_sec_per_gpu": 14.61,
92
+ "tokens/trainable": 261170
93
+ },
94
+ {
95
+ "epoch": 0.05818181818181818,
96
+ "grad_norm": 0.41381561756134033,
97
+ "learning_rate": 2.29126213592233e-05,
98
+ "loss": 0.062263429164886475,
99
+ "memory/device_reserved (GiB)": 18.85,
100
+ "memory/max_active (GiB)": 16.23,
101
+ "memory/max_allocated (GiB)": 16.23,
102
+ "ppl": 1.06424,
103
+ "step": 60,
104
+ "tokens/total": 983040,
105
+ "tokens/train_per_sec_per_gpu": 14.19,
106
+ "tokens/trainable": 313808
107
+ },
108
+ {
109
+ "epoch": 0.06787878787878789,
110
+ "grad_norm": 0.4865979254245758,
111
+ "learning_rate": 2.6796116504854367e-05,
112
+ "loss": 0.018695920705795288,
113
+ "memory/device_reserved (GiB)": 18.85,
114
+ "memory/max_active (GiB)": 16.23,
115
+ "memory/max_allocated (GiB)": 16.23,
116
+ "ppl": 1.01887,
117
+ "step": 70,
118
+ "tokens/total": 1146880,
119
+ "tokens/train_per_sec_per_gpu": 14.62,
120
+ "tokens/trainable": 366068
121
+ },
122
+ {
123
+ "epoch": 0.07757575757575758,
124
+ "grad_norm": 0.39099738001823425,
125
+ "learning_rate": 3.067961165048544e-05,
126
+ "loss": 0.006136053055524826,
127
+ "memory/device_reserved (GiB)": 18.85,
128
+ "memory/max_active (GiB)": 16.23,
129
+ "memory/max_allocated (GiB)": 16.23,
130
+ "ppl": 1.00615,
131
+ "step": 80,
132
+ "tokens/total": 1310720,
133
+ "tokens/train_per_sec_per_gpu": 13.81,
134
+ "tokens/trainable": 418120
135
+ },
136
+ {
137
+ "epoch": 0.08727272727272728,
138
+ "grad_norm": 0.08230593055486679,
139
+ "learning_rate": 3.456310679611651e-05,
140
+ "loss": 0.004204501211643219,
141
+ "memory/device_reserved (GiB)": 18.85,
142
+ "memory/max_active (GiB)": 16.23,
143
+ "memory/max_allocated (GiB)": 16.23,
144
+ "ppl": 1.00421,
145
+ "step": 90,
146
+ "tokens/total": 1474560,
147
+ "tokens/train_per_sec_per_gpu": 15.07,
148
+ "tokens/trainable": 470244
149
+ },
150
+ {
151
+ "epoch": 0.09696969696969697,
152
+ "grad_norm": 0.13297680020332336,
153
+ "learning_rate": 3.844660194174757e-05,
154
+ "loss": 0.0036250378936529158,
155
+ "memory/device_reserved (GiB)": 18.85,
156
+ "memory/max_active (GiB)": 16.23,
157
+ "memory/max_allocated (GiB)": 16.23,
158
+ "ppl": 1.00363,
159
+ "step": 100,
160
+ "tokens/total": 1638400,
161
+ "tokens/train_per_sec_per_gpu": 14.91,
162
+ "tokens/trainable": 522666
163
+ },
164
+ {
165
+ "epoch": 0.10666666666666667,
166
+ "grad_norm": 0.2430051565170288,
167
+ "learning_rate": 4.2330097087378647e-05,
168
+ "loss": 0.003873714804649353,
169
+ "memory/device_reserved (GiB)": 18.85,
170
+ "memory/max_active (GiB)": 16.23,
171
+ "memory/max_allocated (GiB)": 16.23,
172
+ "ppl": 1.00388,
173
+ "step": 110,
174
+ "tokens/total": 1802240,
175
+ "tokens/train_per_sec_per_gpu": 14.17,
176
+ "tokens/trainable": 574329
177
+ },
178
+ {
179
+ "epoch": 0.11636363636363636,
180
+ "grad_norm": 0.09347938001155853,
181
+ "learning_rate": 4.621359223300971e-05,
182
+ "loss": 0.00237951148301363,
183
+ "memory/device_reserved (GiB)": 18.85,
184
+ "memory/max_active (GiB)": 16.23,
185
+ "memory/max_allocated (GiB)": 16.23,
186
+ "ppl": 1.00238,
187
+ "step": 120,
188
+ "tokens/total": 1966080,
189
+ "tokens/train_per_sec_per_gpu": 14.33,
190
+ "tokens/trainable": 626194
191
+ },
192
+ {
193
+ "epoch": 0.12606060606060607,
194
+ "grad_norm": 0.13388365507125854,
195
+ "learning_rate": 5.0097087378640786e-05,
196
+ "loss": 0.0015400107949972153,
197
+ "memory/device_reserved (GiB)": 18.85,
198
+ "memory/max_active (GiB)": 16.23,
199
+ "memory/max_allocated (GiB)": 16.23,
200
+ "ppl": 1.00154,
201
+ "step": 130,
202
+ "tokens/total": 2129920,
203
+ "tokens/train_per_sec_per_gpu": 14.01,
204
+ "tokens/trainable": 678140
205
+ },
206
+ {
207
+ "epoch": 0.13575757575757577,
208
+ "grad_norm": 0.13342970609664917,
209
+ "learning_rate": 5.398058252427185e-05,
210
+ "loss": 0.001996887102723122,
211
+ "memory/device_reserved (GiB)": 18.85,
212
+ "memory/max_active (GiB)": 16.23,
213
+ "memory/max_allocated (GiB)": 16.23,
214
+ "ppl": 1.002,
215
+ "step": 140,
216
+ "tokens/total": 2293760,
217
+ "tokens/train_per_sec_per_gpu": 14.41,
218
+ "tokens/trainable": 730201
219
+ },
220
+ {
221
+ "epoch": 0.14545454545454545,
222
+ "grad_norm": 0.0299234539270401,
223
+ "learning_rate": 5.786407766990292e-05,
224
+ "loss": 0.0015132850036025046,
225
+ "memory/device_reserved (GiB)": 18.85,
226
+ "memory/max_active (GiB)": 16.23,
227
+ "memory/max_allocated (GiB)": 16.23,
228
+ "ppl": 1.00151,
229
+ "step": 150,
230
+ "tokens/total": 2457600,
231
+ "tokens/train_per_sec_per_gpu": 15.8,
232
+ "tokens/trainable": 782196
233
+ },
234
+ {
235
+ "epoch": 0.15515151515151515,
236
+ "grad_norm": 0.04437975212931633,
237
+ "learning_rate": 6.174757281553398e-05,
238
+ "loss": 0.0012883609160780907,
239
+ "memory/device_reserved (GiB)": 18.85,
240
+ "memory/max_active (GiB)": 16.23,
241
+ "memory/max_allocated (GiB)": 16.23,
242
+ "ppl": 1.00129,
243
+ "step": 160,
244
+ "tokens/total": 2621440,
245
+ "tokens/train_per_sec_per_gpu": 14.64,
246
+ "tokens/trainable": 833614
247
+ },
248
+ {
249
+ "epoch": 0.16484848484848486,
250
+ "grad_norm": 0.014039761386811733,
251
+ "learning_rate": 6.563106796116505e-05,
252
+ "loss": 0.0011639594100415706,
253
+ "memory/device_reserved (GiB)": 18.85,
254
+ "memory/max_active (GiB)": 16.23,
255
+ "memory/max_allocated (GiB)": 16.23,
256
+ "ppl": 1.00116,
257
+ "step": 170,
258
+ "tokens/total": 2785280,
259
+ "tokens/train_per_sec_per_gpu": 13.95,
260
+ "tokens/trainable": 885591
261
+ },
262
+ {
263
+ "epoch": 0.17454545454545456,
264
+ "grad_norm": 0.0033261056523770094,
265
+ "learning_rate": 6.951456310679612e-05,
266
+ "loss": 0.0007388167083263397,
267
+ "memory/device_reserved (GiB)": 18.85,
268
+ "memory/max_active (GiB)": 16.23,
269
+ "memory/max_allocated (GiB)": 16.23,
270
+ "ppl": 1.00074,
271
+ "step": 180,
272
+ "tokens/total": 2949120,
273
+ "tokens/train_per_sec_per_gpu": 14.37,
274
+ "tokens/trainable": 937712
275
+ },
276
+ {
277
+ "epoch": 0.18424242424242424,
278
+ "grad_norm": 0.010476192459464073,
279
+ "learning_rate": 7.339805825242719e-05,
280
+ "loss": 0.0008642122149467469,
281
+ "memory/device_reserved (GiB)": 18.85,
282
+ "memory/max_active (GiB)": 16.23,
283
+ "memory/max_allocated (GiB)": 16.23,
284
+ "ppl": 1.00086,
285
+ "step": 190,
286
+ "tokens/total": 3112960,
287
+ "tokens/train_per_sec_per_gpu": 15.52,
288
+ "tokens/trainable": 989913
289
+ },
290
+ {
291
+ "epoch": 0.19393939393939394,
292
+ "grad_norm": 0.01253255270421505,
293
+ "learning_rate": 7.728155339805826e-05,
294
+ "loss": 0.0007610846310853958,
295
+ "memory/device_reserved (GiB)": 18.85,
296
+ "memory/max_active (GiB)": 16.23,
297
+ "memory/max_allocated (GiB)": 16.23,
298
+ "ppl": 1.00076,
299
+ "step": 200,
300
+ "tokens/total": 3276800,
301
+ "tokens/train_per_sec_per_gpu": 14.17,
302
+ "tokens/trainable": 1041978
303
+ },
304
+ {
305
+ "epoch": 0.20363636363636364,
306
+ "grad_norm": 0.01779557578265667,
307
+ "learning_rate": 8.116504854368933e-05,
308
+ "loss": 0.0007697530556470156,
309
+ "memory/device_reserved (GiB)": 18.85,
310
+ "memory/max_active (GiB)": 16.23,
311
+ "memory/max_allocated (GiB)": 16.23,
312
+ "ppl": 1.00077,
313
+ "step": 210,
314
+ "tokens/total": 3440640,
315
+ "tokens/train_per_sec_per_gpu": 14.12,
316
+ "tokens/trainable": 1093395
317
+ },
318
+ {
319
+ "epoch": 0.21333333333333335,
320
+ "grad_norm": 0.16895800828933716,
321
+ "learning_rate": 8.504854368932039e-05,
322
+ "loss": 0.0006535804830491542,
323
+ "memory/device_reserved (GiB)": 18.85,
324
+ "memory/max_active (GiB)": 16.23,
325
+ "memory/max_allocated (GiB)": 16.23,
326
+ "ppl": 1.00065,
327
+ "step": 220,
328
+ "tokens/total": 3604480,
329
+ "tokens/train_per_sec_per_gpu": 14.72,
330
+ "tokens/trainable": 1145329
331
+ },
332
+ {
333
+ "epoch": 0.22303030303030302,
334
+ "grad_norm": 0.08973463624715805,
335
+ "learning_rate": 8.893203883495146e-05,
336
+ "loss": 0.0009510296396911145,
337
+ "memory/device_reserved (GiB)": 18.85,
338
+ "memory/max_active (GiB)": 16.23,
339
+ "memory/max_allocated (GiB)": 16.23,
340
+ "ppl": 1.00095,
341
+ "step": 230,
342
+ "tokens/total": 3768320,
343
+ "tokens/train_per_sec_per_gpu": 14.67,
344
+ "tokens/trainable": 1197537
345
+ },
346
+ {
347
+ "epoch": 0.23272727272727273,
348
+ "grad_norm": 0.044939588755369186,
349
+ "learning_rate": 9.281553398058253e-05,
350
+ "loss": 0.001187363639473915,
351
+ "memory/device_reserved (GiB)": 18.85,
352
+ "memory/max_active (GiB)": 16.23,
353
+ "memory/max_allocated (GiB)": 16.23,
354
+ "ppl": 1.00119,
355
+ "step": 240,
356
+ "tokens/total": 3932160,
357
+ "tokens/train_per_sec_per_gpu": 15.39,
358
+ "tokens/trainable": 1249924
359
+ },
360
+ {
361
+ "epoch": 0.24242424242424243,
362
+ "grad_norm": 0.08850465714931488,
363
+ "learning_rate": 9.66990291262136e-05,
364
+ "loss": 0.0013382930308580398,
365
+ "memory/device_reserved (GiB)": 18.85,
366
+ "memory/max_active (GiB)": 16.23,
367
+ "memory/max_allocated (GiB)": 16.23,
368
+ "ppl": 1.00134,
369
+ "step": 250,
370
+ "tokens/total": 4096000,
371
+ "tokens/train_per_sec_per_gpu": 15.06,
372
+ "tokens/trainable": 1301558
373
+ },
374
+ {
375
+ "epoch": 0.25212121212121213,
376
+ "grad_norm": 0.101528100669384,
377
+ "learning_rate": 0.00010058252427184467,
378
+ "loss": 0.0008709387853741646,
379
+ "memory/device_reserved (GiB)": 18.85,
380
+ "memory/max_active (GiB)": 16.23,
381
+ "memory/max_allocated (GiB)": 16.23,
382
+ "ppl": 1.00087,
383
+ "step": 260,
384
+ "tokens/total": 4259840,
385
+ "tokens/train_per_sec_per_gpu": 15.16,
386
+ "tokens/trainable": 1353706
387
+ },
388
+ {
389
+ "epoch": 0.26181818181818184,
390
+ "grad_norm": 0.08298433572053909,
391
+ "learning_rate": 0.00010446601941747574,
392
+ "loss": 0.0013300922699272632,
393
+ "memory/device_reserved (GiB)": 18.85,
394
+ "memory/max_active (GiB)": 16.23,
395
+ "memory/max_allocated (GiB)": 16.23,
396
+ "ppl": 1.00133,
397
+ "step": 270,
398
+ "tokens/total": 4423680,
399
+ "tokens/train_per_sec_per_gpu": 15.11,
400
+ "tokens/trainable": 1405519
401
+ },
402
+ {
403
+ "epoch": 0.27151515151515154,
404
+ "grad_norm": 0.03734389320015907,
405
+ "learning_rate": 0.00010834951456310681,
406
+ "loss": 0.0006868645548820495,
407
+ "memory/device_reserved (GiB)": 18.85,
408
+ "memory/max_active (GiB)": 16.23,
409
+ "memory/max_allocated (GiB)": 16.23,
410
+ "ppl": 1.00069,
411
+ "step": 280,
412
+ "tokens/total": 4587520,
413
+ "tokens/train_per_sec_per_gpu": 15.07,
414
+ "tokens/trainable": 1457494
415
+ },
416
+ {
417
+ "epoch": 0.2812121212121212,
418
+ "grad_norm": 0.07898428291082382,
419
+ "learning_rate": 0.00011223300970873786,
420
+ "loss": 0.0013550779782235622,
421
+ "memory/device_reserved (GiB)": 18.85,
422
+ "memory/max_active (GiB)": 16.23,
423
+ "memory/max_allocated (GiB)": 16.23,
424
+ "ppl": 1.00136,
425
+ "step": 290,
426
+ "tokens/total": 4751360,
427
+ "tokens/train_per_sec_per_gpu": 14.75,
428
+ "tokens/trainable": 1509320
429
+ },
430
+ {
431
+ "epoch": 0.2909090909090909,
432
+ "grad_norm": 0.06320006400346756,
433
+ "learning_rate": 0.00011611650485436893,
434
+ "loss": 0.0010121697559952736,
435
+ "memory/device_reserved (GiB)": 18.85,
436
+ "memory/max_active (GiB)": 16.23,
437
+ "memory/max_allocated (GiB)": 16.23,
438
+ "ppl": 1.00101,
439
+ "step": 300,
440
+ "tokens/total": 4915200,
441
+ "tokens/train_per_sec_per_gpu": 14.19,
442
+ "tokens/trainable": 1561332
443
+ },
444
+ {
445
+ "epoch": 0.3006060606060606,
446
+ "grad_norm": 0.013749867677688599,
447
+ "learning_rate": 0.00012,
448
+ "loss": 0.0006499682553112507,
449
+ "memory/device_reserved (GiB)": 18.85,
450
+ "memory/max_active (GiB)": 16.23,
451
+ "memory/max_allocated (GiB)": 16.23,
452
+ "ppl": 1.00065,
453
+ "step": 310,
454
+ "tokens/total": 5079040,
455
+ "tokens/train_per_sec_per_gpu": 14.84,
456
+ "tokens/trainable": 1613189
457
+ },
458
+ {
459
+ "epoch": 0.3103030303030303,
460
+ "grad_norm": 0.033964402973651886,
461
+ "learning_rate": 0.00012388349514563107,
462
+ "loss": 0.0008866124786436558,
463
+ "memory/device_reserved (GiB)": 18.85,
464
+ "memory/max_active (GiB)": 16.23,
465
+ "memory/max_allocated (GiB)": 16.23,
466
+ "ppl": 1.00089,
467
+ "step": 320,
468
+ "tokens/total": 5242880,
469
+ "tokens/train_per_sec_per_gpu": 15.78,
470
+ "tokens/trainable": 1665681
471
+ },
472
+ {
473
+ "epoch": 0.32,
474
+ "grad_norm": 0.04327597841620445,
475
+ "learning_rate": 0.00012776699029126213,
476
+ "loss": 0.0005569641944020987,
477
+ "memory/device_reserved (GiB)": 18.85,
478
+ "memory/max_active (GiB)": 16.23,
479
+ "memory/max_allocated (GiB)": 16.23,
480
+ "ppl": 1.00056,
481
+ "step": 330,
482
+ "tokens/total": 5406720,
483
+ "tokens/train_per_sec_per_gpu": 14.92,
484
+ "tokens/trainable": 1718317
485
+ },
486
+ {
487
+ "epoch": 0.3296969696969697,
488
+ "grad_norm": 0.02717934548854828,
489
+ "learning_rate": 0.0001316504854368932,
490
+ "loss": 0.0003776244120672345,
491
+ "memory/device_reserved (GiB)": 18.85,
492
+ "memory/max_active (GiB)": 16.23,
493
+ "memory/max_allocated (GiB)": 16.23,
494
+ "ppl": 1.00038,
495
+ "step": 340,
496
+ "tokens/total": 5570560,
497
+ "tokens/train_per_sec_per_gpu": 14.42,
498
+ "tokens/trainable": 1770210
499
+ },
500
+ {
501
+ "epoch": 0.3393939393939394,
502
+ "grad_norm": 0.0028237912338227034,
503
+ "learning_rate": 0.0001355339805825243,
504
+ "loss": 0.0005292522720992566,
505
+ "memory/device_reserved (GiB)": 18.85,
506
+ "memory/max_active (GiB)": 16.23,
507
+ "memory/max_allocated (GiB)": 16.23,
508
+ "ppl": 1.00053,
509
+ "step": 350,
510
+ "tokens/total": 5734400,
511
+ "tokens/train_per_sec_per_gpu": 16.4,
512
+ "tokens/trainable": 1821987
513
+ },
514
+ {
515
+ "epoch": 0.3490909090909091,
516
+ "grad_norm": 0.0310799703001976,
517
+ "learning_rate": 0.00013941747572815535,
518
+ "loss": 0.0006786303594708443,
519
+ "memory/device_reserved (GiB)": 18.85,
520
+ "memory/max_active (GiB)": 16.23,
521
+ "memory/max_allocated (GiB)": 16.23,
522
+ "ppl": 1.00068,
523
+ "step": 360,
524
+ "tokens/total": 5898240,
525
+ "tokens/train_per_sec_per_gpu": 14.72,
526
+ "tokens/trainable": 1874266
527
+ },
528
+ {
529
+ "epoch": 0.35878787878787877,
530
+ "grad_norm": 0.17325043678283691,
531
+ "learning_rate": 0.0001433009708737864,
532
+ "loss": 0.0013975565321743487,
533
+ "memory/device_reserved (GiB)": 18.85,
534
+ "memory/max_active (GiB)": 16.23,
535
+ "memory/max_allocated (GiB)": 16.23,
536
+ "ppl": 1.0014,
537
+ "step": 370,
538
+ "tokens/total": 6062080,
539
+ "tokens/train_per_sec_per_gpu": 13.73,
540
+ "tokens/trainable": 1926124
541
+ },
542
+ {
543
+ "epoch": 0.36848484848484847,
544
+ "grad_norm": 0.07738752663135529,
545
+ "learning_rate": 0.0001471844660194175,
546
+ "loss": 0.0006820175796747208,
547
+ "memory/device_reserved (GiB)": 18.85,
548
+ "memory/max_active (GiB)": 16.23,
549
+ "memory/max_allocated (GiB)": 16.23,
550
+ "ppl": 1.00068,
551
+ "step": 380,
552
+ "tokens/total": 6225920,
553
+ "tokens/train_per_sec_per_gpu": 14.04,
554
+ "tokens/trainable": 1978693
555
+ },
556
+ {
557
+ "epoch": 0.3781818181818182,
558
+ "grad_norm": 0.10022349655628204,
559
+ "learning_rate": 0.00015106796116504855,
560
+ "loss": 0.00063879219815135,
561
+ "memory/device_reserved (GiB)": 18.85,
562
+ "memory/max_active (GiB)": 16.23,
563
+ "memory/max_allocated (GiB)": 16.23,
564
+ "ppl": 1.00064,
565
+ "step": 390,
566
+ "tokens/total": 6389760,
567
+ "tokens/train_per_sec_per_gpu": 13.34,
568
+ "tokens/trainable": 2030378
569
+ },
570
+ {
571
+ "epoch": 0.3878787878787879,
572
+ "grad_norm": 0.0495997779071331,
573
+ "learning_rate": 0.00015495145631067963,
574
+ "loss": 0.0021283581852912905,
575
+ "memory/device_reserved (GiB)": 18.85,
576
+ "memory/max_active (GiB)": 16.23,
577
+ "memory/max_allocated (GiB)": 16.23,
578
+ "ppl": 1.00213,
579
+ "step": 400,
580
+ "tokens/total": 6553600,
581
+ "tokens/train_per_sec_per_gpu": 15.34,
582
+ "tokens/trainable": 2083047
583
+ },
584
+ {
585
+ "epoch": 0.3975757575757576,
586
+ "grad_norm": 0.07361701130867004,
587
+ "learning_rate": 0.0001588349514563107,
588
+ "loss": 0.001862115040421486,
589
+ "memory/device_reserved (GiB)": 18.85,
590
+ "memory/max_active (GiB)": 16.23,
591
+ "memory/max_allocated (GiB)": 16.23,
592
+ "ppl": 1.00186,
593
+ "step": 410,
594
+ "tokens/total": 6717440,
595
+ "tokens/train_per_sec_per_gpu": 14.23,
596
+ "tokens/trainable": 2135527
597
+ },
598
+ {
599
+ "epoch": 0.4072727272727273,
600
+ "grad_norm": 0.05466209724545479,
601
+ "learning_rate": 0.00016271844660194174,
602
+ "loss": 0.0011581303551793098,
603
+ "memory/device_reserved (GiB)": 18.85,
604
+ "memory/max_active (GiB)": 16.23,
605
+ "memory/max_allocated (GiB)": 16.23,
606
+ "ppl": 1.00116,
607
+ "step": 420,
608
+ "tokens/total": 6881280,
609
+ "tokens/train_per_sec_per_gpu": 14.77,
610
+ "tokens/trainable": 2187636
611
+ },
612
+ {
613
+ "epoch": 0.416969696969697,
614
+ "grad_norm": 0.04331392049789429,
615
+ "learning_rate": 0.00016660194174757283,
616
+ "loss": 0.0051729224622249605,
617
+ "memory/device_reserved (GiB)": 18.85,
618
+ "memory/max_active (GiB)": 16.23,
619
+ "memory/max_allocated (GiB)": 16.23,
620
+ "ppl": 1.00519,
621
+ "step": 430,
622
+ "tokens/total": 7045120,
623
+ "tokens/train_per_sec_per_gpu": 13.76,
624
+ "tokens/trainable": 2239006
625
+ },
626
+ {
627
+ "epoch": 0.4266666666666667,
628
+ "grad_norm": 0.05931795388460159,
629
+ "learning_rate": 0.00017048543689320388,
630
+ "loss": 0.00242764875292778,
631
+ "memory/device_reserved (GiB)": 18.85,
632
+ "memory/max_active (GiB)": 16.23,
633
+ "memory/max_allocated (GiB)": 16.23,
634
+ "ppl": 1.00243,
635
+ "step": 440,
636
+ "tokens/total": 7208960,
637
+ "tokens/train_per_sec_per_gpu": 14.59,
638
+ "tokens/trainable": 2290540
639
+ },
640
+ {
641
+ "epoch": 0.43636363636363634,
642
+ "grad_norm": 0.04634418711066246,
643
+ "learning_rate": 0.00017436893203883494,
644
+ "loss": 0.001389546226710081,
645
+ "memory/device_reserved (GiB)": 18.85,
646
+ "memory/max_active (GiB)": 16.23,
647
+ "memory/max_allocated (GiB)": 16.23,
648
+ "ppl": 1.00139,
649
+ "step": 450,
650
+ "tokens/total": 7372800,
651
+ "tokens/train_per_sec_per_gpu": 14.78,
652
+ "tokens/trainable": 2341852
653
+ },
654
+ {
655
+ "epoch": 0.44606060606060605,
656
+ "grad_norm": 0.04817213863134384,
657
+ "learning_rate": 0.00017825242718446602,
658
+ "loss": 0.001370794139802456,
659
+ "memory/device_reserved (GiB)": 18.85,
660
+ "memory/max_active (GiB)": 16.23,
661
+ "memory/max_allocated (GiB)": 16.23,
662
+ "ppl": 1.00137,
663
+ "step": 460,
664
+ "tokens/total": 7536640,
665
+ "tokens/train_per_sec_per_gpu": 13.77,
666
+ "tokens/trainable": 2393320
667
+ },
668
+ {
669
+ "epoch": 0.45575757575757575,
670
+ "grad_norm": 0.011335949413478374,
671
+ "learning_rate": 0.00018213592233009708,
672
+ "loss": 0.0009715131483972073,
673
+ "memory/device_reserved (GiB)": 18.85,
674
+ "memory/max_active (GiB)": 16.23,
675
+ "memory/max_allocated (GiB)": 16.23,
676
+ "ppl": 1.00097,
677
+ "step": 470,
678
+ "tokens/total": 7700480,
679
+ "tokens/train_per_sec_per_gpu": 14.52,
680
+ "tokens/trainable": 2445170
681
+ },
682
+ {
683
+ "epoch": 0.46545454545454545,
684
+ "grad_norm": 0.05298445746302605,
685
+ "learning_rate": 0.00018601941747572816,
686
+ "loss": 0.0008222623728215694,
687
+ "memory/device_reserved (GiB)": 18.85,
688
+ "memory/max_active (GiB)": 16.23,
689
+ "memory/max_allocated (GiB)": 16.23,
690
+ "ppl": 1.00082,
691
+ "step": 480,
692
+ "tokens/total": 7864320,
693
+ "tokens/train_per_sec_per_gpu": 13.87,
694
+ "tokens/trainable": 2497473
695
+ },
696
+ {
697
+ "epoch": 0.47515151515151516,
698
+ "grad_norm": 0.061686884611845016,
699
+ "learning_rate": 0.00018990291262135925,
700
+ "loss": 0.000748783303424716,
701
+ "memory/device_reserved (GiB)": 18.85,
702
+ "memory/max_active (GiB)": 16.23,
703
+ "memory/max_allocated (GiB)": 16.23,
704
+ "ppl": 1.00075,
705
+ "step": 490,
706
+ "tokens/total": 8028160,
707
+ "tokens/train_per_sec_per_gpu": 15.41,
708
+ "tokens/trainable": 2549206
709
+ },
710
+ {
711
+ "epoch": 0.48484848484848486,
712
+ "grad_norm": 0.03281249850988388,
713
+ "learning_rate": 0.0001937864077669903,
714
+ "loss": 0.0006062469445168972,
715
+ "memory/device_reserved (GiB)": 18.85,
716
+ "memory/max_active (GiB)": 16.23,
717
+ "memory/max_allocated (GiB)": 16.23,
718
+ "ppl": 1.00061,
719
+ "step": 500,
720
+ "tokens/total": 8192000,
721
+ "tokens/train_per_sec_per_gpu": 14.49,
722
+ "tokens/trainable": 2600583
723
+ },
724
+ {
725
+ "epoch": 0.49454545454545457,
726
+ "grad_norm": 0.008482079952955246,
727
+ "learning_rate": 0.0001976699029126214,
728
+ "loss": 0.0008583014830946922,
729
+ "memory/device_reserved (GiB)": 18.85,
730
+ "memory/max_active (GiB)": 16.23,
731
+ "memory/max_allocated (GiB)": 16.23,
732
+ "ppl": 1.00086,
733
+ "step": 510,
734
+ "tokens/total": 8355840,
735
+ "tokens/train_per_sec_per_gpu": 13.86,
736
+ "tokens/trainable": 2652927
737
+ },
738
+ {
739
+ "epoch": 0.5003636363636363,
740
+ "eval_loss": 0.0009036393603309989,
741
+ "eval_ppl": 1.0009,
742
+ "eval_runtime": 12.7872,
743
+ "eval_samples_per_second": 15.641,
744
+ "eval_steps_per_second": 7.82,
745
+ "memory/device_reserved (GiB)": 18.85,
746
+ "memory/max_active (GiB)": 16.23,
747
+ "memory/max_allocated (GiB)": 16.23,
748
+ "step": 516
749
+ },
750
+ {
751
+ "epoch": 0.5042424242424243,
752
+ "grad_norm": 0.04333305358886719,
753
+ "learning_rate": 0.0001999996332640321,
754
+ "loss": 0.0005093200132250785,
755
+ "memory/device_reserved (GiB)": 20.01,
756
+ "memory/max_active (GiB)": 16.23,
757
+ "memory/max_allocated (GiB)": 16.23,
758
+ "ppl": 1.00051,
759
+ "step": 520,
760
+ "tokens/total": 8519680,
761
+ "tokens/train_per_sec_per_gpu": 14.09,
762
+ "tokens/trainable": 2705083
763
+ },
764
+ {
765
+ "epoch": 0.5139393939393939,
766
+ "grad_norm": 0.02485118806362152,
767
+ "learning_rate": 0.00019999550751528488,
768
+ "loss": 0.0006649125367403031,
769
+ "memory/device_reserved (GiB)": 20.01,
770
+ "memory/max_active (GiB)": 16.23,
771
+ "memory/max_allocated (GiB)": 16.23,
772
+ "ppl": 1.00067,
773
+ "step": 530,
774
+ "tokens/total": 8683520,
775
+ "tokens/train_per_sec_per_gpu": 14.44,
776
+ "tokens/trainable": 2756975
777
+ },
778
+ {
779
+ "epoch": 0.5236363636363637,
780
+ "grad_norm": 0.03736363351345062,
781
+ "learning_rate": 0.00019998679778759294,
782
+ "loss": 0.0006726076360791921,
783
+ "memory/device_reserved (GiB)": 20.01,
784
+ "memory/max_active (GiB)": 16.23,
785
+ "memory/max_allocated (GiB)": 16.23,
786
+ "ppl": 1.00067,
787
+ "step": 540,
788
+ "tokens/total": 8847360,
789
+ "tokens/train_per_sec_per_gpu": 14.16,
790
+ "tokens/trainable": 2808076
791
+ },
792
+ {
793
+ "epoch": 0.5333333333333333,
794
+ "grad_norm": 0.05156765505671501,
795
+ "learning_rate": 0.0001999735044802263,
796
+ "loss": 0.000789718609303236,
797
+ "memory/device_reserved (GiB)": 20.01,
798
+ "memory/max_active (GiB)": 16.23,
799
+ "memory/max_allocated (GiB)": 16.23,
800
+ "ppl": 1.00079,
801
+ "step": 550,
802
+ "tokens/total": 9011200,
803
+ "tokens/train_per_sec_per_gpu": 16.36,
804
+ "tokens/trainable": 2859893
805
+ },
806
+ {
807
+ "epoch": 0.5430303030303031,
808
+ "grad_norm": 0.647550106048584,
809
+ "learning_rate": 0.00019995562820257474,
810
+ "loss": 0.003008325584232807,
811
+ "memory/device_reserved (GiB)": 20.01,
812
+ "memory/max_active (GiB)": 16.23,
813
+ "memory/max_allocated (GiB)": 16.23,
814
+ "ppl": 1.00301,
815
+ "step": 560,
816
+ "tokens/total": 9175040,
817
+ "tokens/train_per_sec_per_gpu": 14.21,
818
+ "tokens/trainable": 2911399
819
+ },
820
+ {
821
+ "epoch": 0.5527272727272727,
822
+ "grad_norm": 0.185165673494339,
823
+ "learning_rate": 0.00019993316977411993,
824
+ "loss": 0.013715097308158874,
825
+ "memory/device_reserved (GiB)": 20.01,
826
+ "memory/max_active (GiB)": 16.23,
827
+ "memory/max_allocated (GiB)": 16.23,
828
+ "ppl": 1.01381,
829
+ "step": 570,
830
+ "tokens/total": 9338880,
831
+ "tokens/train_per_sec_per_gpu": 13.85,
832
+ "tokens/trainable": 2962403
833
+ },
834
+ {
835
+ "epoch": 0.5624242424242424,
836
+ "grad_norm": 0.2401553839445114,
837
+ "learning_rate": 0.0001999061302243977,
838
+ "loss": 0.009026474505662917,
839
+ "memory/device_reserved (GiB)": 20.01,
840
+ "memory/max_active (GiB)": 16.23,
841
+ "memory/max_allocated (GiB)": 16.23,
842
+ "ppl": 1.00907,
843
+ "step": 580,
844
+ "tokens/total": 9502720,
845
+ "tokens/train_per_sec_per_gpu": 14.38,
846
+ "tokens/trainable": 3015083
847
+ },
848
+ {
849
+ "epoch": 0.5721212121212121,
850
+ "grad_norm": 0.08092579245567322,
851
+ "learning_rate": 0.000199874510792951,
852
+ "loss": 0.005716494470834732,
853
+ "memory/device_reserved (GiB)": 20.01,
854
+ "memory/max_active (GiB)": 16.23,
855
+ "memory/max_allocated (GiB)": 16.23,
856
+ "ppl": 1.00573,
857
+ "step": 590,
858
+ "tokens/total": 9666560,
859
+ "tokens/train_per_sec_per_gpu": 16.38,
860
+ "tokens/trainable": 3066501
861
+ },
862
+ {
863
+ "epoch": 0.5818181818181818,
864
+ "grad_norm": 3.418715476989746,
865
+ "learning_rate": 0.00019983831292927305,
866
+ "loss": 0.048504295945167544,
867
+ "memory/device_reserved (GiB)": 20.01,
868
+ "memory/max_active (GiB)": 16.23,
869
+ "memory/max_allocated (GiB)": 16.23,
870
+ "ppl": 1.0497,
871
+ "step": 600,
872
+ "tokens/total": 9830400,
873
+ "tokens/train_per_sec_per_gpu": 14.23,
874
+ "tokens/trainable": 3118633
875
+ },
876
+ {
877
+ "epoch": 0.5915151515151515,
878
+ "grad_norm": 0.2194036841392517,
879
+ "learning_rate": 0.00019979753829274085,
880
+ "loss": 0.03429323434829712,
881
+ "memory/device_reserved (GiB)": 20.01,
882
+ "memory/max_active (GiB)": 16.23,
883
+ "memory/max_allocated (GiB)": 16.23,
884
+ "ppl": 1.03489,
885
+ "step": 610,
886
+ "tokens/total": 9994240,
887
+ "tokens/train_per_sec_per_gpu": 13.14,
888
+ "tokens/trainable": 3170577
889
+ },
890
+ {
891
+ "epoch": 0.6012121212121212,
892
+ "grad_norm": 0.022929901257157326,
893
+ "learning_rate": 0.0001997521887525391,
894
+ "loss": 0.0015171168372035027,
895
+ "memory/device_reserved (GiB)": 20.01,
896
+ "memory/max_active (GiB)": 16.23,
897
+ "memory/max_allocated (GiB)": 16.23,
898
+ "ppl": 1.00152,
899
+ "step": 620,
900
+ "tokens/total": 10158080,
901
+ "tokens/train_per_sec_per_gpu": 14.24,
902
+ "tokens/trainable": 3221696
903
+ },
904
+ {
905
+ "epoch": 0.610909090909091,
906
+ "grad_norm": 0.10083670169115067,
907
+ "learning_rate": 0.00019970226638757458,
908
+ "loss": 0.0025377947837114333,
909
+ "memory/device_reserved (GiB)": 20.01,
910
+ "memory/max_active (GiB)": 16.23,
911
+ "memory/max_allocated (GiB)": 16.23,
912
+ "ppl": 1.00254,
913
+ "step": 630,
914
+ "tokens/total": 10321920,
915
+ "tokens/train_per_sec_per_gpu": 14.7,
916
+ "tokens/trainable": 3273775
917
+ },
918
+ {
919
+ "epoch": 0.6206060606060606,
920
+ "grad_norm": 0.01761380024254322,
921
+ "learning_rate": 0.00019964777348638083,
922
+ "loss": 0.002281896211206913,
923
+ "memory/device_reserved (GiB)": 20.01,
924
+ "memory/max_active (GiB)": 16.23,
925
+ "memory/max_allocated (GiB)": 16.23,
926
+ "ppl": 1.00228,
927
+ "step": 640,
928
+ "tokens/total": 10485760,
929
+ "tokens/train_per_sec_per_gpu": 14.89,
930
+ "tokens/trainable": 3325516
931
+ },
932
+ {
933
+ "epoch": 0.6303030303030303,
934
+ "grad_norm": 0.004510029684752226,
935
+ "learning_rate": 0.00019958871254701315,
936
+ "loss": 0.0009477110579609871,
937
+ "memory/device_reserved (GiB)": 20.01,
938
+ "memory/max_active (GiB)": 16.23,
939
+ "memory/max_allocated (GiB)": 16.23,
940
+ "ppl": 1.00095,
941
+ "step": 650,
942
+ "tokens/total": 10649600,
943
+ "tokens/train_per_sec_per_gpu": 16.46,
944
+ "tokens/trainable": 3377214
945
+ },
946
+ {
947
+ "epoch": 0.64,
948
+ "grad_norm": 0.05332477018237114,
949
+ "learning_rate": 0.0001995250862769342,
950
+ "loss": 0.0005660496186465025,
951
+ "memory/device_reserved (GiB)": 20.01,
952
+ "memory/max_active (GiB)": 16.23,
953
+ "memory/max_allocated (GiB)": 16.23,
954
+ "ppl": 1.00057,
955
+ "step": 660,
956
+ "tokens/total": 10813440,
957
+ "tokens/train_per_sec_per_gpu": 14.52,
958
+ "tokens/trainable": 3428627
959
+ },
960
+ {
961
+ "epoch": 0.6496969696969697,
962
+ "grad_norm": 0.03861689195036888,
963
+ "learning_rate": 0.0001994568975928899,
964
+ "loss": 0.0008976863697171211,
965
+ "memory/device_reserved (GiB)": 20.01,
966
+ "memory/max_active (GiB)": 16.23,
967
+ "memory/max_allocated (GiB)": 16.23,
968
+ "ppl": 1.0009,
969
+ "step": 670,
970
+ "tokens/total": 10977280,
971
+ "tokens/train_per_sec_per_gpu": 15.66,
972
+ "tokens/trainable": 3480170
973
+ },
974
+ {
975
+ "epoch": 0.6593939393939394,
976
+ "grad_norm": 0.021123304963111877,
977
+ "learning_rate": 0.00019938414962077553,
978
+ "loss": 0.0009612766094505787,
979
+ "memory/device_reserved (GiB)": 20.01,
980
+ "memory/max_active (GiB)": 16.23,
981
+ "memory/max_allocated (GiB)": 16.23,
982
+ "ppl": 1.00096,
983
+ "step": 680,
984
+ "tokens/total": 11141120,
985
+ "tokens/train_per_sec_per_gpu": 15.15,
986
+ "tokens/trainable": 3532037
987
+ },
988
+ {
989
+ "epoch": 0.6690909090909091,
990
+ "grad_norm": 0.02421347238123417,
991
+ "learning_rate": 0.00019930684569549264,
992
+ "loss": 0.001021684519946575,
993
+ "memory/device_reserved (GiB)": 20.01,
994
+ "memory/max_active (GiB)": 16.23,
995
+ "memory/max_allocated (GiB)": 16.23,
996
+ "ppl": 1.00102,
997
+ "step": 690,
998
+ "tokens/total": 11304960,
999
+ "tokens/train_per_sec_per_gpu": 14.16,
1000
+ "tokens/trainable": 3583461
1001
+ },
1002
+ {
1003
+ "epoch": 0.6787878787878788,
1004
+ "grad_norm": 0.05008835345506668,
1005
+ "learning_rate": 0.00019922498936079613,
1006
+ "loss": 0.0007617876864969731,
1007
+ "memory/device_reserved (GiB)": 20.01,
1008
+ "memory/max_active (GiB)": 16.23,
1009
+ "memory/max_allocated (GiB)": 16.23,
1010
+ "ppl": 1.00076,
1011
+ "step": 700,
1012
+ "tokens/total": 11468800,
1013
+ "tokens/train_per_sec_per_gpu": 14.08,
1014
+ "tokens/trainable": 3634649
1015
+ },
1016
+ {
1017
+ "epoch": 0.6884848484848485,
1018
+ "grad_norm": 0.035733792930841446,
1019
+ "learning_rate": 0.00019913858436913171,
1020
+ "loss": 0.0012347914278507232,
1021
+ "memory/device_reserved (GiB)": 20.01,
1022
+ "memory/max_active (GiB)": 16.23,
1023
+ "memory/max_allocated (GiB)": 16.23,
1024
+ "ppl": 1.00124,
1025
+ "step": 710,
1026
+ "tokens/total": 11632640,
1027
+ "tokens/train_per_sec_per_gpu": 14.45,
1028
+ "tokens/trainable": 3685786
1029
+ },
1030
+ {
1031
+ "epoch": 0.6981818181818182,
1032
+ "grad_norm": 0.010948767885565758,
1033
+ "learning_rate": 0.00019904763468146393,
1034
+ "loss": 0.0008165687322616577,
1035
+ "memory/device_reserved (GiB)": 20.01,
1036
+ "memory/max_active (GiB)": 16.23,
1037
+ "memory/max_allocated (GiB)": 16.23,
1038
+ "ppl": 1.00082,
1039
+ "step": 720,
1040
+ "tokens/total": 11796480,
1041
+ "tokens/train_per_sec_per_gpu": 15.77,
1042
+ "tokens/trainable": 3737566
1043
+ },
1044
+ {
1045
+ "epoch": 0.7078787878787879,
1046
+ "grad_norm": 0.03577027469873428,
1047
+ "learning_rate": 0.00019895214446709463,
1048
+ "loss": 0.001333119161427021,
1049
+ "memory/device_reserved (GiB)": 20.01,
1050
+ "memory/max_active (GiB)": 16.23,
1051
+ "memory/max_allocated (GiB)": 16.23,
1052
+ "ppl": 1.00133,
1053
+ "step": 730,
1054
+ "tokens/total": 11960320,
1055
+ "tokens/train_per_sec_per_gpu": 13.98,
1056
+ "tokens/trainable": 3789817
1057
+ },
1058
+ {
1059
+ "epoch": 0.7175757575757575,
1060
+ "grad_norm": 0.03971279785037041,
1061
+ "learning_rate": 0.00019885211810347184,
1062
+ "loss": 0.0011184611357748508,
1063
+ "memory/device_reserved (GiB)": 20.01,
1064
+ "memory/max_active (GiB)": 16.23,
1065
+ "memory/max_allocated (GiB)": 16.23,
1066
+ "ppl": 1.00112,
1067
+ "step": 740,
1068
+ "tokens/total": 12124160,
1069
+ "tokens/train_per_sec_per_gpu": 14.67,
1070
+ "tokens/trainable": 3841912
1071
+ },
1072
+ {
1073
+ "epoch": 0.7272727272727273,
1074
+ "grad_norm": 0.06546575576066971,
1075
+ "learning_rate": 0.00019874756017598894,
1076
+ "loss": 0.0012452728115022182,
1077
+ "memory/device_reserved (GiB)": 20.01,
1078
+ "memory/max_active (GiB)": 16.23,
1079
+ "memory/max_allocated (GiB)": 16.23,
1080
+ "ppl": 1.00125,
1081
+ "step": 750,
1082
+ "tokens/total": 12288000,
1083
+ "tokens/train_per_sec_per_gpu": 14.58,
1084
+ "tokens/trainable": 3893725
1085
+ },
1086
+ {
1087
+ "epoch": 0.7369696969696969,
1088
+ "grad_norm": 0.047058816999197006,
1089
+ "learning_rate": 0.00019863847547777467,
1090
+ "loss": 0.0008146104402840138,
1091
+ "memory/device_reserved (GiB)": 20.01,
1092
+ "memory/max_active (GiB)": 16.23,
1093
+ "memory/max_allocated (GiB)": 16.23,
1094
+ "ppl": 1.00081,
1095
+ "step": 760,
1096
+ "tokens/total": 12451840,
1097
+ "tokens/train_per_sec_per_gpu": 13.49,
1098
+ "tokens/trainable": 3945033
1099
+ },
1100
+ {
1101
+ "epoch": 0.7466666666666667,
1102
+ "grad_norm": 0.028811641037464142,
1103
+ "learning_rate": 0.00019852486900947327,
1104
+ "loss": 0.0008652995340526104,
1105
+ "memory/device_reserved (GiB)": 20.01,
1106
+ "memory/max_active (GiB)": 16.23,
1107
+ "memory/max_allocated (GiB)": 16.23,
1108
+ "ppl": 1.00087,
1109
+ "step": 770,
1110
+ "tokens/total": 12615680,
1111
+ "tokens/train_per_sec_per_gpu": 15.12,
1112
+ "tokens/trainable": 3996749
1113
+ },
1114
+ {
1115
+ "epoch": 0.7563636363636363,
1116
+ "grad_norm": 0.012203546240925789,
1117
+ "learning_rate": 0.0001984067459790153,
1118
+ "loss": 0.000670672720298171,
1119
+ "memory/device_reserved (GiB)": 20.01,
1120
+ "memory/max_active (GiB)": 16.23,
1121
+ "memory/max_allocated (GiB)": 16.23,
1122
+ "ppl": 1.00067,
1123
+ "step": 780,
1124
+ "tokens/total": 12779520,
1125
+ "tokens/train_per_sec_per_gpu": 13.71,
1126
+ "tokens/trainable": 4048173
1127
+ },
1128
+ {
1129
+ "epoch": 0.7660606060606061,
1130
+ "grad_norm": 0.016218814998865128,
1131
+ "learning_rate": 0.0001982841118013789,
1132
+ "loss": 0.00046353964135050776,
1133
+ "memory/device_reserved (GiB)": 20.01,
1134
+ "memory/max_active (GiB)": 16.23,
1135
+ "memory/max_allocated (GiB)": 16.23,
1136
+ "ppl": 1.00046,
1137
+ "step": 790,
1138
+ "tokens/total": 12943360,
1139
+ "tokens/train_per_sec_per_gpu": 15.1,
1140
+ "tokens/trainable": 4099789
1141
+ },
1142
+ {
1143
+ "epoch": 0.7757575757575758,
1144
+ "grad_norm": 0.034673016518354416,
1145
+ "learning_rate": 0.00019815697209834147,
1146
+ "loss": 0.000707306619733572,
1147
+ "memory/device_reserved (GiB)": 20.01,
1148
+ "memory/max_active (GiB)": 16.23,
1149
+ "memory/max_allocated (GiB)": 16.23,
1150
+ "ppl": 1.00071,
1151
+ "step": 800,
1152
+ "tokens/total": 13107200,
1153
+ "tokens/train_per_sec_per_gpu": 14.45,
1154
+ "tokens/trainable": 4150960
1155
+ },
1156
+ {
1157
+ "epoch": 0.7854545454545454,
1158
+ "grad_norm": 0.0022127812262624502,
1159
+ "learning_rate": 0.00019802533269822208,
1160
+ "loss": 0.00021896373946219682,
1161
+ "memory/device_reserved (GiB)": 20.01,
1162
+ "memory/max_active (GiB)": 16.23,
1163
+ "memory/max_allocated (GiB)": 16.23,
1164
+ "ppl": 1.00022,
1165
+ "step": 810,
1166
+ "tokens/total": 13271040,
1167
+ "tokens/train_per_sec_per_gpu": 14.75,
1168
+ "tokens/trainable": 4202984
1169
+ },
1170
+ {
1171
+ "epoch": 0.7951515151515152,
1172
+ "grad_norm": 0.000919274752959609,
1173
+ "learning_rate": 0.00019788919963561422,
1174
+ "loss": 0.00043264860287308695,
1175
+ "memory/device_reserved (GiB)": 20.01,
1176
+ "memory/max_active (GiB)": 16.23,
1177
+ "memory/max_allocated (GiB)": 16.23,
1178
+ "ppl": 1.00043,
1179
+ "step": 820,
1180
+ "tokens/total": 13434880,
1181
+ "tokens/train_per_sec_per_gpu": 14.06,
1182
+ "tokens/trainable": 4254907
1183
+ },
1184
+ {
1185
+ "epoch": 0.8048484848484848,
1186
+ "grad_norm": 0.007699873298406601,
1187
+ "learning_rate": 0.00019774857915110913,
1188
+ "loss": 0.0003196246922016144,
1189
+ "memory/device_reserved (GiB)": 20.01,
1190
+ "memory/max_active (GiB)": 16.23,
1191
+ "memory/max_allocated (GiB)": 16.23,
1192
+ "ppl": 1.00032,
1193
+ "step": 830,
1194
+ "tokens/total": 13598720,
1195
+ "tokens/train_per_sec_per_gpu": 14.75,
1196
+ "tokens/trainable": 4306095
1197
+ },
1198
+ {
1199
+ "epoch": 0.8145454545454546,
1200
+ "grad_norm": 0.015523642301559448,
1201
+ "learning_rate": 0.00019760347769100987,
1202
+ "loss": 0.0004476988688111305,
1203
+ "memory/device_reserved (GiB)": 20.01,
1204
+ "memory/max_active (GiB)": 16.23,
1205
+ "memory/max_allocated (GiB)": 16.23,
1206
+ "ppl": 1.00045,
1207
+ "step": 840,
1208
+ "tokens/total": 13762560,
1209
+ "tokens/train_per_sec_per_gpu": 14.14,
1210
+ "tokens/trainable": 4357442
1211
+ },
1212
+ {
1213
+ "epoch": 0.8242424242424242,
1214
+ "grad_norm": 0.013460986316204071,
1215
+ "learning_rate": 0.00019745390190703565,
1216
+ "loss": 0.0004673306830227375,
1217
+ "memory/device_reserved (GiB)": 20.01,
1218
+ "memory/max_active (GiB)": 16.23,
1219
+ "memory/max_allocated (GiB)": 16.23,
1220
+ "ppl": 1.00047,
1221
+ "step": 850,
1222
+ "tokens/total": 13926400,
1223
+ "tokens/train_per_sec_per_gpu": 14.1,
1224
+ "tokens/trainable": 4409277
1225
+ },
1226
+ {
1227
+ "epoch": 0.833939393939394,
1228
+ "grad_norm": 0.0014691110700368881,
1229
+ "learning_rate": 0.0001972998586560169,
1230
+ "loss": 0.0003277578856796026,
1231
+ "memory/device_reserved (GiB)": 20.01,
1232
+ "memory/max_active (GiB)": 16.23,
1233
+ "memory/max_allocated (GiB)": 16.23,
1234
+ "ppl": 1.00033,
1235
+ "step": 860,
1236
+ "tokens/total": 14090240,
1237
+ "tokens/train_per_sec_per_gpu": 14.28,
1238
+ "tokens/trainable": 4460714
1239
+ },
1240
+ {
1241
+ "epoch": 0.8436363636363636,
1242
+ "grad_norm": 0.001358041656203568,
1243
+ "learning_rate": 0.00019714135499958112,
1244
+ "loss": 0.00032470382284373046,
1245
+ "memory/device_reserved (GiB)": 20.01,
1246
+ "memory/max_active (GiB)": 16.23,
1247
+ "memory/max_allocated (GiB)": 16.23,
1248
+ "ppl": 1.00032,
1249
+ "step": 870,
1250
+ "tokens/total": 14254080,
1251
+ "tokens/train_per_sec_per_gpu": 13.85,
1252
+ "tokens/trainable": 4511989
1253
+ },
1254
+ {
1255
+ "epoch": 0.8533333333333334,
1256
+ "grad_norm": 0.04510723799467087,
1257
+ "learning_rate": 0.0001969783982038289,
1258
+ "loss": 0.00023182881996035575,
1259
+ "memory/device_reserved (GiB)": 20.01,
1260
+ "memory/max_active (GiB)": 16.23,
1261
+ "memory/max_allocated (GiB)": 16.23,
1262
+ "ppl": 1.00023,
1263
+ "step": 880,
1264
+ "tokens/total": 14417920,
1265
+ "tokens/train_per_sec_per_gpu": 15.41,
1266
+ "tokens/trainable": 4563354
1267
+ },
1268
+ {
1269
+ "epoch": 0.863030303030303,
1270
+ "grad_norm": 0.14508692920207977,
1271
+ "learning_rate": 0.00019681099573900113,
1272
+ "loss": 0.00026136748492717744,
1273
+ "memory/device_reserved (GiB)": 20.01,
1274
+ "memory/max_active (GiB)": 16.23,
1275
+ "memory/max_allocated (GiB)": 16.23,
1276
+ "ppl": 1.00026,
1277
+ "step": 890,
1278
+ "tokens/total": 14581760,
1279
+ "tokens/train_per_sec_per_gpu": 13.85,
1280
+ "tokens/trainable": 4615691
1281
+ },
1282
+ {
1283
+ "epoch": 0.8727272727272727,
1284
+ "grad_norm": 0.010969490744173527,
1285
+ "learning_rate": 0.00019663915527913625,
1286
+ "loss": 0.00016044279327616097,
1287
+ "memory/device_reserved (GiB)": 20.01,
1288
+ "memory/max_active (GiB)": 16.23,
1289
+ "memory/max_allocated (GiB)": 16.23,
1290
+ "ppl": 1.00016,
1291
+ "step": 900,
1292
+ "tokens/total": 14745600,
1293
+ "tokens/train_per_sec_per_gpu": 15.76,
1294
+ "tokens/trainable": 4667433
1295
+ },
1296
+ {
1297
+ "epoch": 0.8824242424242424,
1298
+ "grad_norm": 0.03874114155769348,
1299
+ "learning_rate": 0.00019646288470171868,
1300
+ "loss": 0.0004159804433584213,
1301
+ "memory/device_reserved (GiB)": 20.01,
1302
+ "memory/max_active (GiB)": 16.23,
1303
+ "memory/max_allocated (GiB)": 16.23,
1304
+ "ppl": 1.00042,
1305
+ "step": 910,
1306
+ "tokens/total": 14909440,
1307
+ "tokens/train_per_sec_per_gpu": 16.01,
1308
+ "tokens/trainable": 4719807
1309
+ },
1310
+ {
1311
+ "epoch": 0.8921212121212121,
1312
+ "grad_norm": 0.044620465487241745,
1313
+ "learning_rate": 0.00019628219208731756,
1314
+ "loss": 0.0006739750038832426,
1315
+ "memory/device_reserved (GiB)": 20.01,
1316
+ "memory/max_active (GiB)": 16.23,
1317
+ "memory/max_allocated (GiB)": 16.23,
1318
+ "ppl": 1.00067,
1319
+ "step": 920,
1320
+ "tokens/total": 15073280,
1321
+ "tokens/train_per_sec_per_gpu": 15.05,
1322
+ "tokens/trainable": 4771772
1323
+ },
1324
+ {
1325
+ "epoch": 0.9018181818181819,
1326
+ "grad_norm": 0.024856949225068092,
1327
+ "learning_rate": 0.00019609708571921645,
1328
+ "loss": 0.00039347023703157903,
1329
+ "memory/device_reserved (GiB)": 20.01,
1330
+ "memory/max_active (GiB)": 16.23,
1331
+ "memory/max_allocated (GiB)": 16.23,
1332
+ "ppl": 1.00039,
1333
+ "step": 930,
1334
+ "tokens/total": 15237120,
1335
+ "tokens/train_per_sec_per_gpu": 15.16,
1336
+ "tokens/trainable": 4823415
1337
+ },
1338
+ {
1339
+ "epoch": 0.9115151515151515,
1340
+ "grad_norm": 0.022198157384991646,
1341
+ "learning_rate": 0.0001959075740830335,
1342
+ "loss": 0.0005907822400331497,
1343
+ "memory/device_reserved (GiB)": 20.01,
1344
+ "memory/max_active (GiB)": 16.23,
1345
+ "memory/max_allocated (GiB)": 16.23,
1346
+ "ppl": 1.00059,
1347
+ "step": 940,
1348
+ "tokens/total": 15400960,
1349
+ "tokens/train_per_sec_per_gpu": 15.36,
1350
+ "tokens/trainable": 4875269
1351
+ },
1352
+ {
1353
+ "epoch": 0.9212121212121213,
1354
+ "grad_norm": 0.01670038513839245,
1355
+ "learning_rate": 0.00019571366586633245,
1356
+ "loss": 0.00027316866908222437,
1357
+ "memory/device_reserved (GiB)": 20.01,
1358
+ "memory/max_active (GiB)": 16.23,
1359
+ "memory/max_allocated (GiB)": 16.23,
1360
+ "ppl": 1.00027,
1361
+ "step": 950,
1362
+ "tokens/total": 15564800,
1363
+ "tokens/train_per_sec_per_gpu": 15.11,
1364
+ "tokens/trainable": 4927244
1365
+ },
1366
+ {
1367
+ "epoch": 0.9309090909090909,
1368
+ "grad_norm": 0.021392742171883583,
1369
+ "learning_rate": 0.00019551536995822454,
1370
+ "loss": 0.0004320886451750994,
1371
+ "memory/device_reserved (GiB)": 20.01,
1372
+ "memory/max_active (GiB)": 16.23,
1373
+ "memory/max_allocated (GiB)": 16.23,
1374
+ "ppl": 1.00043,
1375
+ "step": 960,
1376
+ "tokens/total": 15728640,
1377
+ "tokens/train_per_sec_per_gpu": 14.16,
1378
+ "tokens/trainable": 4979068
1379
+ },
1380
+ {
1381
+ "epoch": 0.9406060606060606,
1382
+ "grad_norm": 0.028143158182501793,
1383
+ "learning_rate": 0.00019531269544896076,
1384
+ "loss": 0.0005637989845126868,
1385
+ "memory/device_reserved (GiB)": 20.01,
1386
+ "memory/max_active (GiB)": 16.23,
1387
+ "memory/max_allocated (GiB)": 16.23,
1388
+ "ppl": 1.00056,
1389
+ "step": 970,
1390
+ "tokens/total": 15892480,
1391
+ "tokens/train_per_sec_per_gpu": 14.26,
1392
+ "tokens/trainable": 5030980
1393
+ },
1394
+ {
1395
+ "epoch": 0.9503030303030303,
1396
+ "grad_norm": 0.077091746032238,
1397
+ "learning_rate": 0.00019510565162951537,
1398
+ "loss": 0.0010597245767712594,
1399
+ "memory/device_reserved (GiB)": 20.01,
1400
+ "memory/max_active (GiB)": 16.23,
1401
+ "memory/max_allocated (GiB)": 16.23,
1402
+ "ppl": 1.00106,
1403
+ "step": 980,
1404
+ "tokens/total": 16056320,
1405
+ "tokens/train_per_sec_per_gpu": 14.04,
1406
+ "tokens/trainable": 5082759
1407
+ },
1408
+ {
1409
+ "epoch": 0.96,
1410
+ "grad_norm": 0.04455556347966194,
1411
+ "learning_rate": 0.00019489424799115984,
1412
+ "loss": 0.0009517236612737179,
1413
+ "memory/device_reserved (GiB)": 20.01,
1414
+ "memory/max_active (GiB)": 16.23,
1415
+ "memory/max_allocated (GiB)": 16.23,
1416
+ "ppl": 1.00095,
1417
+ "step": 990,
1418
+ "tokens/total": 16220160,
1419
+ "tokens/train_per_sec_per_gpu": 13.04,
1420
+ "tokens/trainable": 5134379
1421
+ },
1422
+ {
1423
+ "epoch": 0.9696969696969697,
1424
+ "grad_norm": 0.03573840856552124,
1425
+ "learning_rate": 0.00019467849422502784,
1426
+ "loss": 0.0008812972344458103,
1427
+ "memory/device_reserved (GiB)": 20.01,
1428
+ "memory/max_active (GiB)": 16.23,
1429
+ "memory/max_allocated (GiB)": 16.23,
1430
+ "ppl": 1.00088,
1431
+ "step": 1000,
1432
+ "tokens/total": 16384000,
1433
+ "tokens/train_per_sec_per_gpu": 15.23,
1434
+ "tokens/trainable": 5186184
1435
+ },
1436
+ {
1437
+ "epoch": 0.9793939393939394,
1438
+ "grad_norm": 0.0006549305398948491,
1439
+ "learning_rate": 0.0001944584002216709,
1440
+ "loss": 0.0006358013488352299,
1441
+ "memory/device_reserved (GiB)": 20.01,
1442
+ "memory/max_active (GiB)": 16.23,
1443
+ "memory/max_allocated (GiB)": 16.23,
1444
+ "ppl": 1.00064,
1445
+ "step": 1010,
1446
+ "tokens/total": 16547840,
1447
+ "tokens/train_per_sec_per_gpu": 16.1,
1448
+ "tokens/trainable": 5238320
1449
+ },
1450
+ {
1451
+ "epoch": 0.9890909090909091,
1452
+ "grad_norm": 0.021742813289165497,
1453
+ "learning_rate": 0.00019423397607060507,
1454
+ "loss": 0.000400003744289279,
1455
+ "memory/device_reserved (GiB)": 20.01,
1456
+ "memory/max_active (GiB)": 16.23,
1457
+ "memory/max_allocated (GiB)": 16.23,
1458
+ "ppl": 1.0004,
1459
+ "step": 1020,
1460
+ "tokens/total": 16711680,
1461
+ "tokens/train_per_sec_per_gpu": 14.53,
1462
+ "tokens/trainable": 5290445
1463
+ },
1464
+ {
1465
+ "epoch": 0.9987878787878788,
1466
+ "grad_norm": 0.04323820024728775,
1467
+ "learning_rate": 0.00019400523205984833,
1468
+ "loss": 0.0002954686991870403,
1469
+ "memory/device_reserved (GiB)": 20.01,
1470
+ "memory/max_active (GiB)": 16.23,
1471
+ "memory/max_allocated (GiB)": 16.23,
1472
+ "ppl": 1.0003,
1473
+ "step": 1030,
1474
+ "tokens/total": 16875520,
1475
+ "tokens/train_per_sec_per_gpu": 14.98,
1476
+ "tokens/trainable": 5342720
1477
+ }
1478
+ ],
1479
+ "logging_steps": 10,
1480
+ "max_steps": 5155,
1481
+ "num_input_tokens_seen": 0,
1482
+ "num_train_epochs": 5,
1483
+ "save_steps": 1031,
1484
+ "stateful_callbacks": {
1485
+ "TrainerControl": {
1486
+ "args": {
1487
+ "should_epoch_stop": false,
1488
+ "should_evaluate": false,
1489
+ "should_log": false,
1490
+ "should_save": true,
1491
+ "should_training_stop": false
1492
+ },
1493
+ "attributes": {}
1494
+ }
1495
+ },
1496
+ "total_flos": 3.7522967515417805e+17,
1497
+ "train_batch_size": 2,
1498
+ "trial_name": null,
1499
+ "trial_params": null
1500
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-1031/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
3
+ size 7121
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "q_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da62714aaa23848ef165e457db4ac64c154ba06f2678a79b5c9a5f4d9131e877
3
+ size 264308896
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9568a797d761234200086dde3789d57bc2b2944e366ab3e5a3273a96b0515b3
3
+ size 528915403
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d132e1337dcddf36eb41c4686b7b1f64060722a1c210d58a733ddcfc9c9fb9b
3
+ size 14645
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fad802e90d13070878b8d7c99bc22b3028181f43a8f5425264542fd04c806af
3
+ size 1465
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/tokens_state. ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total": 33808384, "trainable": 10703590}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/trainer_state.json ADDED
@@ -0,0 +1,2966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.001939393939394,
6
+ "eval_steps": 516,
7
+ "global_step": 2062,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0,
14
+ "eval_loss": 0.8898435831069946,
15
+ "eval_ppl": 2.43475,
16
+ "eval_runtime": 12.6383,
17
+ "eval_samples_per_second": 15.825,
18
+ "eval_steps_per_second": 7.912,
19
+ "memory/device_reserved (GiB)": 13.84,
20
+ "memory/max_active (GiB)": 13.69,
21
+ "memory/max_allocated (GiB)": 13.69,
22
+ "step": 0
23
+ },
24
+ {
25
+ "epoch": 0.009696969696969697,
26
+ "grad_norm": 2.995619058609009,
27
+ "learning_rate": 3.4951456310679615e-06,
28
+ "loss": 0.8680612564086914,
29
+ "memory/device_reserved (GiB)": 18.85,
30
+ "memory/max_active (GiB)": 16.23,
31
+ "memory/max_allocated (GiB)": 16.23,
32
+ "ppl": 2.38229,
33
+ "step": 10,
34
+ "tokens/total": 163840,
35
+ "tokens/train_per_sec_per_gpu": 14.27,
36
+ "tokens/trainable": 51990
37
+ },
38
+ {
39
+ "epoch": 0.019393939393939394,
40
+ "grad_norm": 2.1244935989379883,
41
+ "learning_rate": 7.378640776699029e-06,
42
+ "loss": 0.7699687004089355,
43
+ "memory/device_reserved (GiB)": 18.85,
44
+ "memory/max_active (GiB)": 16.23,
45
+ "memory/max_allocated (GiB)": 16.23,
46
+ "ppl": 2.1597,
47
+ "step": 20,
48
+ "tokens/total": 327680,
49
+ "tokens/train_per_sec_per_gpu": 16.06,
50
+ "tokens/trainable": 104391
51
+ },
52
+ {
53
+ "epoch": 0.02909090909090909,
54
+ "grad_norm": 0.9706138372421265,
55
+ "learning_rate": 1.1262135922330098e-05,
56
+ "loss": 0.5319457054138184,
57
+ "memory/device_reserved (GiB)": 18.85,
58
+ "memory/max_active (GiB)": 16.23,
59
+ "memory/max_allocated (GiB)": 16.23,
60
+ "ppl": 1.70224,
61
+ "step": 30,
62
+ "tokens/total": 491520,
63
+ "tokens/train_per_sec_per_gpu": 16.48,
64
+ "tokens/trainable": 156787
65
+ },
66
+ {
67
+ "epoch": 0.03878787878787879,
68
+ "grad_norm": 0.7689842581748962,
69
+ "learning_rate": 1.5145631067961166e-05,
70
+ "loss": 0.30234951972961427,
71
+ "memory/device_reserved (GiB)": 18.85,
72
+ "memory/max_active (GiB)": 16.23,
73
+ "memory/max_allocated (GiB)": 16.23,
74
+ "ppl": 1.35303,
75
+ "step": 40,
76
+ "tokens/total": 655360,
77
+ "tokens/train_per_sec_per_gpu": 14.84,
78
+ "tokens/trainable": 208924
79
+ },
80
+ {
81
+ "epoch": 0.048484848484848485,
82
+ "grad_norm": 0.45850396156311035,
83
+ "learning_rate": 1.9029126213592234e-05,
84
+ "loss": 0.1519382953643799,
85
+ "memory/device_reserved (GiB)": 18.85,
86
+ "memory/max_active (GiB)": 16.23,
87
+ "memory/max_allocated (GiB)": 16.23,
88
+ "ppl": 1.16409,
89
+ "step": 50,
90
+ "tokens/total": 819200,
91
+ "tokens/train_per_sec_per_gpu": 14.61,
92
+ "tokens/trainable": 261170
93
+ },
94
+ {
95
+ "epoch": 0.05818181818181818,
96
+ "grad_norm": 0.41381561756134033,
97
+ "learning_rate": 2.29126213592233e-05,
98
+ "loss": 0.062263429164886475,
99
+ "memory/device_reserved (GiB)": 18.85,
100
+ "memory/max_active (GiB)": 16.23,
101
+ "memory/max_allocated (GiB)": 16.23,
102
+ "ppl": 1.06424,
103
+ "step": 60,
104
+ "tokens/total": 983040,
105
+ "tokens/train_per_sec_per_gpu": 14.19,
106
+ "tokens/trainable": 313808
107
+ },
108
+ {
109
+ "epoch": 0.06787878787878789,
110
+ "grad_norm": 0.4865979254245758,
111
+ "learning_rate": 2.6796116504854367e-05,
112
+ "loss": 0.018695920705795288,
113
+ "memory/device_reserved (GiB)": 18.85,
114
+ "memory/max_active (GiB)": 16.23,
115
+ "memory/max_allocated (GiB)": 16.23,
116
+ "ppl": 1.01887,
117
+ "step": 70,
118
+ "tokens/total": 1146880,
119
+ "tokens/train_per_sec_per_gpu": 14.62,
120
+ "tokens/trainable": 366068
121
+ },
122
+ {
123
+ "epoch": 0.07757575757575758,
124
+ "grad_norm": 0.39099738001823425,
125
+ "learning_rate": 3.067961165048544e-05,
126
+ "loss": 0.006136053055524826,
127
+ "memory/device_reserved (GiB)": 18.85,
128
+ "memory/max_active (GiB)": 16.23,
129
+ "memory/max_allocated (GiB)": 16.23,
130
+ "ppl": 1.00615,
131
+ "step": 80,
132
+ "tokens/total": 1310720,
133
+ "tokens/train_per_sec_per_gpu": 13.81,
134
+ "tokens/trainable": 418120
135
+ },
136
+ {
137
+ "epoch": 0.08727272727272728,
138
+ "grad_norm": 0.08230593055486679,
139
+ "learning_rate": 3.456310679611651e-05,
140
+ "loss": 0.004204501211643219,
141
+ "memory/device_reserved (GiB)": 18.85,
142
+ "memory/max_active (GiB)": 16.23,
143
+ "memory/max_allocated (GiB)": 16.23,
144
+ "ppl": 1.00421,
145
+ "step": 90,
146
+ "tokens/total": 1474560,
147
+ "tokens/train_per_sec_per_gpu": 15.07,
148
+ "tokens/trainable": 470244
149
+ },
150
+ {
151
+ "epoch": 0.09696969696969697,
152
+ "grad_norm": 0.13297680020332336,
153
+ "learning_rate": 3.844660194174757e-05,
154
+ "loss": 0.0036250378936529158,
155
+ "memory/device_reserved (GiB)": 18.85,
156
+ "memory/max_active (GiB)": 16.23,
157
+ "memory/max_allocated (GiB)": 16.23,
158
+ "ppl": 1.00363,
159
+ "step": 100,
160
+ "tokens/total": 1638400,
161
+ "tokens/train_per_sec_per_gpu": 14.91,
162
+ "tokens/trainable": 522666
163
+ },
164
+ {
165
+ "epoch": 0.10666666666666667,
166
+ "grad_norm": 0.2430051565170288,
167
+ "learning_rate": 4.2330097087378647e-05,
168
+ "loss": 0.003873714804649353,
169
+ "memory/device_reserved (GiB)": 18.85,
170
+ "memory/max_active (GiB)": 16.23,
171
+ "memory/max_allocated (GiB)": 16.23,
172
+ "ppl": 1.00388,
173
+ "step": 110,
174
+ "tokens/total": 1802240,
175
+ "tokens/train_per_sec_per_gpu": 14.17,
176
+ "tokens/trainable": 574329
177
+ },
178
+ {
179
+ "epoch": 0.11636363636363636,
180
+ "grad_norm": 0.09347938001155853,
181
+ "learning_rate": 4.621359223300971e-05,
182
+ "loss": 0.00237951148301363,
183
+ "memory/device_reserved (GiB)": 18.85,
184
+ "memory/max_active (GiB)": 16.23,
185
+ "memory/max_allocated (GiB)": 16.23,
186
+ "ppl": 1.00238,
187
+ "step": 120,
188
+ "tokens/total": 1966080,
189
+ "tokens/train_per_sec_per_gpu": 14.33,
190
+ "tokens/trainable": 626194
191
+ },
192
+ {
193
+ "epoch": 0.12606060606060607,
194
+ "grad_norm": 0.13388365507125854,
195
+ "learning_rate": 5.0097087378640786e-05,
196
+ "loss": 0.0015400107949972153,
197
+ "memory/device_reserved (GiB)": 18.85,
198
+ "memory/max_active (GiB)": 16.23,
199
+ "memory/max_allocated (GiB)": 16.23,
200
+ "ppl": 1.00154,
201
+ "step": 130,
202
+ "tokens/total": 2129920,
203
+ "tokens/train_per_sec_per_gpu": 14.01,
204
+ "tokens/trainable": 678140
205
+ },
206
+ {
207
+ "epoch": 0.13575757575757577,
208
+ "grad_norm": 0.13342970609664917,
209
+ "learning_rate": 5.398058252427185e-05,
210
+ "loss": 0.001996887102723122,
211
+ "memory/device_reserved (GiB)": 18.85,
212
+ "memory/max_active (GiB)": 16.23,
213
+ "memory/max_allocated (GiB)": 16.23,
214
+ "ppl": 1.002,
215
+ "step": 140,
216
+ "tokens/total": 2293760,
217
+ "tokens/train_per_sec_per_gpu": 14.41,
218
+ "tokens/trainable": 730201
219
+ },
220
+ {
221
+ "epoch": 0.14545454545454545,
222
+ "grad_norm": 0.0299234539270401,
223
+ "learning_rate": 5.786407766990292e-05,
224
+ "loss": 0.0015132850036025046,
225
+ "memory/device_reserved (GiB)": 18.85,
226
+ "memory/max_active (GiB)": 16.23,
227
+ "memory/max_allocated (GiB)": 16.23,
228
+ "ppl": 1.00151,
229
+ "step": 150,
230
+ "tokens/total": 2457600,
231
+ "tokens/train_per_sec_per_gpu": 15.8,
232
+ "tokens/trainable": 782196
233
+ },
234
+ {
235
+ "epoch": 0.15515151515151515,
236
+ "grad_norm": 0.04437975212931633,
237
+ "learning_rate": 6.174757281553398e-05,
238
+ "loss": 0.0012883609160780907,
239
+ "memory/device_reserved (GiB)": 18.85,
240
+ "memory/max_active (GiB)": 16.23,
241
+ "memory/max_allocated (GiB)": 16.23,
242
+ "ppl": 1.00129,
243
+ "step": 160,
244
+ "tokens/total": 2621440,
245
+ "tokens/train_per_sec_per_gpu": 14.64,
246
+ "tokens/trainable": 833614
247
+ },
248
+ {
249
+ "epoch": 0.16484848484848486,
250
+ "grad_norm": 0.014039761386811733,
251
+ "learning_rate": 6.563106796116505e-05,
252
+ "loss": 0.0011639594100415706,
253
+ "memory/device_reserved (GiB)": 18.85,
254
+ "memory/max_active (GiB)": 16.23,
255
+ "memory/max_allocated (GiB)": 16.23,
256
+ "ppl": 1.00116,
257
+ "step": 170,
258
+ "tokens/total": 2785280,
259
+ "tokens/train_per_sec_per_gpu": 13.95,
260
+ "tokens/trainable": 885591
261
+ },
262
+ {
263
+ "epoch": 0.17454545454545456,
264
+ "grad_norm": 0.0033261056523770094,
265
+ "learning_rate": 6.951456310679612e-05,
266
+ "loss": 0.0007388167083263397,
267
+ "memory/device_reserved (GiB)": 18.85,
268
+ "memory/max_active (GiB)": 16.23,
269
+ "memory/max_allocated (GiB)": 16.23,
270
+ "ppl": 1.00074,
271
+ "step": 180,
272
+ "tokens/total": 2949120,
273
+ "tokens/train_per_sec_per_gpu": 14.37,
274
+ "tokens/trainable": 937712
275
+ },
276
+ {
277
+ "epoch": 0.18424242424242424,
278
+ "grad_norm": 0.010476192459464073,
279
+ "learning_rate": 7.339805825242719e-05,
280
+ "loss": 0.0008642122149467469,
281
+ "memory/device_reserved (GiB)": 18.85,
282
+ "memory/max_active (GiB)": 16.23,
283
+ "memory/max_allocated (GiB)": 16.23,
284
+ "ppl": 1.00086,
285
+ "step": 190,
286
+ "tokens/total": 3112960,
287
+ "tokens/train_per_sec_per_gpu": 15.52,
288
+ "tokens/trainable": 989913
289
+ },
290
+ {
291
+ "epoch": 0.19393939393939394,
292
+ "grad_norm": 0.01253255270421505,
293
+ "learning_rate": 7.728155339805826e-05,
294
+ "loss": 0.0007610846310853958,
295
+ "memory/device_reserved (GiB)": 18.85,
296
+ "memory/max_active (GiB)": 16.23,
297
+ "memory/max_allocated (GiB)": 16.23,
298
+ "ppl": 1.00076,
299
+ "step": 200,
300
+ "tokens/total": 3276800,
301
+ "tokens/train_per_sec_per_gpu": 14.17,
302
+ "tokens/trainable": 1041978
303
+ },
304
+ {
305
+ "epoch": 0.20363636363636364,
306
+ "grad_norm": 0.01779557578265667,
307
+ "learning_rate": 8.116504854368933e-05,
308
+ "loss": 0.0007697530556470156,
309
+ "memory/device_reserved (GiB)": 18.85,
310
+ "memory/max_active (GiB)": 16.23,
311
+ "memory/max_allocated (GiB)": 16.23,
312
+ "ppl": 1.00077,
313
+ "step": 210,
314
+ "tokens/total": 3440640,
315
+ "tokens/train_per_sec_per_gpu": 14.12,
316
+ "tokens/trainable": 1093395
317
+ },
318
+ {
319
+ "epoch": 0.21333333333333335,
320
+ "grad_norm": 0.16895800828933716,
321
+ "learning_rate": 8.504854368932039e-05,
322
+ "loss": 0.0006535804830491542,
323
+ "memory/device_reserved (GiB)": 18.85,
324
+ "memory/max_active (GiB)": 16.23,
325
+ "memory/max_allocated (GiB)": 16.23,
326
+ "ppl": 1.00065,
327
+ "step": 220,
328
+ "tokens/total": 3604480,
329
+ "tokens/train_per_sec_per_gpu": 14.72,
330
+ "tokens/trainable": 1145329
331
+ },
332
+ {
333
+ "epoch": 0.22303030303030302,
334
+ "grad_norm": 0.08973463624715805,
335
+ "learning_rate": 8.893203883495146e-05,
336
+ "loss": 0.0009510296396911145,
337
+ "memory/device_reserved (GiB)": 18.85,
338
+ "memory/max_active (GiB)": 16.23,
339
+ "memory/max_allocated (GiB)": 16.23,
340
+ "ppl": 1.00095,
341
+ "step": 230,
342
+ "tokens/total": 3768320,
343
+ "tokens/train_per_sec_per_gpu": 14.67,
344
+ "tokens/trainable": 1197537
345
+ },
346
+ {
347
+ "epoch": 0.23272727272727273,
348
+ "grad_norm": 0.044939588755369186,
349
+ "learning_rate": 9.281553398058253e-05,
350
+ "loss": 0.001187363639473915,
351
+ "memory/device_reserved (GiB)": 18.85,
352
+ "memory/max_active (GiB)": 16.23,
353
+ "memory/max_allocated (GiB)": 16.23,
354
+ "ppl": 1.00119,
355
+ "step": 240,
356
+ "tokens/total": 3932160,
357
+ "tokens/train_per_sec_per_gpu": 15.39,
358
+ "tokens/trainable": 1249924
359
+ },
360
+ {
361
+ "epoch": 0.24242424242424243,
362
+ "grad_norm": 0.08850465714931488,
363
+ "learning_rate": 9.66990291262136e-05,
364
+ "loss": 0.0013382930308580398,
365
+ "memory/device_reserved (GiB)": 18.85,
366
+ "memory/max_active (GiB)": 16.23,
367
+ "memory/max_allocated (GiB)": 16.23,
368
+ "ppl": 1.00134,
369
+ "step": 250,
370
+ "tokens/total": 4096000,
371
+ "tokens/train_per_sec_per_gpu": 15.06,
372
+ "tokens/trainable": 1301558
373
+ },
374
+ {
375
+ "epoch": 0.25212121212121213,
376
+ "grad_norm": 0.101528100669384,
377
+ "learning_rate": 0.00010058252427184467,
378
+ "loss": 0.0008709387853741646,
379
+ "memory/device_reserved (GiB)": 18.85,
380
+ "memory/max_active (GiB)": 16.23,
381
+ "memory/max_allocated (GiB)": 16.23,
382
+ "ppl": 1.00087,
383
+ "step": 260,
384
+ "tokens/total": 4259840,
385
+ "tokens/train_per_sec_per_gpu": 15.16,
386
+ "tokens/trainable": 1353706
387
+ },
388
+ {
389
+ "epoch": 0.26181818181818184,
390
+ "grad_norm": 0.08298433572053909,
391
+ "learning_rate": 0.00010446601941747574,
392
+ "loss": 0.0013300922699272632,
393
+ "memory/device_reserved (GiB)": 18.85,
394
+ "memory/max_active (GiB)": 16.23,
395
+ "memory/max_allocated (GiB)": 16.23,
396
+ "ppl": 1.00133,
397
+ "step": 270,
398
+ "tokens/total": 4423680,
399
+ "tokens/train_per_sec_per_gpu": 15.11,
400
+ "tokens/trainable": 1405519
401
+ },
402
+ {
403
+ "epoch": 0.27151515151515154,
404
+ "grad_norm": 0.03734389320015907,
405
+ "learning_rate": 0.00010834951456310681,
406
+ "loss": 0.0006868645548820495,
407
+ "memory/device_reserved (GiB)": 18.85,
408
+ "memory/max_active (GiB)": 16.23,
409
+ "memory/max_allocated (GiB)": 16.23,
410
+ "ppl": 1.00069,
411
+ "step": 280,
412
+ "tokens/total": 4587520,
413
+ "tokens/train_per_sec_per_gpu": 15.07,
414
+ "tokens/trainable": 1457494
415
+ },
416
+ {
417
+ "epoch": 0.2812121212121212,
418
+ "grad_norm": 0.07898428291082382,
419
+ "learning_rate": 0.00011223300970873786,
420
+ "loss": 0.0013550779782235622,
421
+ "memory/device_reserved (GiB)": 18.85,
422
+ "memory/max_active (GiB)": 16.23,
423
+ "memory/max_allocated (GiB)": 16.23,
424
+ "ppl": 1.00136,
425
+ "step": 290,
426
+ "tokens/total": 4751360,
427
+ "tokens/train_per_sec_per_gpu": 14.75,
428
+ "tokens/trainable": 1509320
429
+ },
430
+ {
431
+ "epoch": 0.2909090909090909,
432
+ "grad_norm": 0.06320006400346756,
433
+ "learning_rate": 0.00011611650485436893,
434
+ "loss": 0.0010121697559952736,
435
+ "memory/device_reserved (GiB)": 18.85,
436
+ "memory/max_active (GiB)": 16.23,
437
+ "memory/max_allocated (GiB)": 16.23,
438
+ "ppl": 1.00101,
439
+ "step": 300,
440
+ "tokens/total": 4915200,
441
+ "tokens/train_per_sec_per_gpu": 14.19,
442
+ "tokens/trainable": 1561332
443
+ },
444
+ {
445
+ "epoch": 0.3006060606060606,
446
+ "grad_norm": 0.013749867677688599,
447
+ "learning_rate": 0.00012,
448
+ "loss": 0.0006499682553112507,
449
+ "memory/device_reserved (GiB)": 18.85,
450
+ "memory/max_active (GiB)": 16.23,
451
+ "memory/max_allocated (GiB)": 16.23,
452
+ "ppl": 1.00065,
453
+ "step": 310,
454
+ "tokens/total": 5079040,
455
+ "tokens/train_per_sec_per_gpu": 14.84,
456
+ "tokens/trainable": 1613189
457
+ },
458
+ {
459
+ "epoch": 0.3103030303030303,
460
+ "grad_norm": 0.033964402973651886,
461
+ "learning_rate": 0.00012388349514563107,
462
+ "loss": 0.0008866124786436558,
463
+ "memory/device_reserved (GiB)": 18.85,
464
+ "memory/max_active (GiB)": 16.23,
465
+ "memory/max_allocated (GiB)": 16.23,
466
+ "ppl": 1.00089,
467
+ "step": 320,
468
+ "tokens/total": 5242880,
469
+ "tokens/train_per_sec_per_gpu": 15.78,
470
+ "tokens/trainable": 1665681
471
+ },
472
+ {
473
+ "epoch": 0.32,
474
+ "grad_norm": 0.04327597841620445,
475
+ "learning_rate": 0.00012776699029126213,
476
+ "loss": 0.0005569641944020987,
477
+ "memory/device_reserved (GiB)": 18.85,
478
+ "memory/max_active (GiB)": 16.23,
479
+ "memory/max_allocated (GiB)": 16.23,
480
+ "ppl": 1.00056,
481
+ "step": 330,
482
+ "tokens/total": 5406720,
483
+ "tokens/train_per_sec_per_gpu": 14.92,
484
+ "tokens/trainable": 1718317
485
+ },
486
+ {
487
+ "epoch": 0.3296969696969697,
488
+ "grad_norm": 0.02717934548854828,
489
+ "learning_rate": 0.0001316504854368932,
490
+ "loss": 0.0003776244120672345,
491
+ "memory/device_reserved (GiB)": 18.85,
492
+ "memory/max_active (GiB)": 16.23,
493
+ "memory/max_allocated (GiB)": 16.23,
494
+ "ppl": 1.00038,
495
+ "step": 340,
496
+ "tokens/total": 5570560,
497
+ "tokens/train_per_sec_per_gpu": 14.42,
498
+ "tokens/trainable": 1770210
499
+ },
500
+ {
501
+ "epoch": 0.3393939393939394,
502
+ "grad_norm": 0.0028237912338227034,
503
+ "learning_rate": 0.0001355339805825243,
504
+ "loss": 0.0005292522720992566,
505
+ "memory/device_reserved (GiB)": 18.85,
506
+ "memory/max_active (GiB)": 16.23,
507
+ "memory/max_allocated (GiB)": 16.23,
508
+ "ppl": 1.00053,
509
+ "step": 350,
510
+ "tokens/total": 5734400,
511
+ "tokens/train_per_sec_per_gpu": 16.4,
512
+ "tokens/trainable": 1821987
513
+ },
514
+ {
515
+ "epoch": 0.3490909090909091,
516
+ "grad_norm": 0.0310799703001976,
517
+ "learning_rate": 0.00013941747572815535,
518
+ "loss": 0.0006786303594708443,
519
+ "memory/device_reserved (GiB)": 18.85,
520
+ "memory/max_active (GiB)": 16.23,
521
+ "memory/max_allocated (GiB)": 16.23,
522
+ "ppl": 1.00068,
523
+ "step": 360,
524
+ "tokens/total": 5898240,
525
+ "tokens/train_per_sec_per_gpu": 14.72,
526
+ "tokens/trainable": 1874266
527
+ },
528
+ {
529
+ "epoch": 0.35878787878787877,
530
+ "grad_norm": 0.17325043678283691,
531
+ "learning_rate": 0.0001433009708737864,
532
+ "loss": 0.0013975565321743487,
533
+ "memory/device_reserved (GiB)": 18.85,
534
+ "memory/max_active (GiB)": 16.23,
535
+ "memory/max_allocated (GiB)": 16.23,
536
+ "ppl": 1.0014,
537
+ "step": 370,
538
+ "tokens/total": 6062080,
539
+ "tokens/train_per_sec_per_gpu": 13.73,
540
+ "tokens/trainable": 1926124
541
+ },
542
+ {
543
+ "epoch": 0.36848484848484847,
544
+ "grad_norm": 0.07738752663135529,
545
+ "learning_rate": 0.0001471844660194175,
546
+ "loss": 0.0006820175796747208,
547
+ "memory/device_reserved (GiB)": 18.85,
548
+ "memory/max_active (GiB)": 16.23,
549
+ "memory/max_allocated (GiB)": 16.23,
550
+ "ppl": 1.00068,
551
+ "step": 380,
552
+ "tokens/total": 6225920,
553
+ "tokens/train_per_sec_per_gpu": 14.04,
554
+ "tokens/trainable": 1978693
555
+ },
556
+ {
557
+ "epoch": 0.3781818181818182,
558
+ "grad_norm": 0.10022349655628204,
559
+ "learning_rate": 0.00015106796116504855,
560
+ "loss": 0.00063879219815135,
561
+ "memory/device_reserved (GiB)": 18.85,
562
+ "memory/max_active (GiB)": 16.23,
563
+ "memory/max_allocated (GiB)": 16.23,
564
+ "ppl": 1.00064,
565
+ "step": 390,
566
+ "tokens/total": 6389760,
567
+ "tokens/train_per_sec_per_gpu": 13.34,
568
+ "tokens/trainable": 2030378
569
+ },
570
+ {
571
+ "epoch": 0.3878787878787879,
572
+ "grad_norm": 0.0495997779071331,
573
+ "learning_rate": 0.00015495145631067963,
574
+ "loss": 0.0021283581852912905,
575
+ "memory/device_reserved (GiB)": 18.85,
576
+ "memory/max_active (GiB)": 16.23,
577
+ "memory/max_allocated (GiB)": 16.23,
578
+ "ppl": 1.00213,
579
+ "step": 400,
580
+ "tokens/total": 6553600,
581
+ "tokens/train_per_sec_per_gpu": 15.34,
582
+ "tokens/trainable": 2083047
583
+ },
584
+ {
585
+ "epoch": 0.3975757575757576,
586
+ "grad_norm": 0.07361701130867004,
587
+ "learning_rate": 0.0001588349514563107,
588
+ "loss": 0.001862115040421486,
589
+ "memory/device_reserved (GiB)": 18.85,
590
+ "memory/max_active (GiB)": 16.23,
591
+ "memory/max_allocated (GiB)": 16.23,
592
+ "ppl": 1.00186,
593
+ "step": 410,
594
+ "tokens/total": 6717440,
595
+ "tokens/train_per_sec_per_gpu": 14.23,
596
+ "tokens/trainable": 2135527
597
+ },
598
+ {
599
+ "epoch": 0.4072727272727273,
600
+ "grad_norm": 0.05466209724545479,
601
+ "learning_rate": 0.00016271844660194174,
602
+ "loss": 0.0011581303551793098,
603
+ "memory/device_reserved (GiB)": 18.85,
604
+ "memory/max_active (GiB)": 16.23,
605
+ "memory/max_allocated (GiB)": 16.23,
606
+ "ppl": 1.00116,
607
+ "step": 420,
608
+ "tokens/total": 6881280,
609
+ "tokens/train_per_sec_per_gpu": 14.77,
610
+ "tokens/trainable": 2187636
611
+ },
612
+ {
613
+ "epoch": 0.416969696969697,
614
+ "grad_norm": 0.04331392049789429,
615
+ "learning_rate": 0.00016660194174757283,
616
+ "loss": 0.0051729224622249605,
617
+ "memory/device_reserved (GiB)": 18.85,
618
+ "memory/max_active (GiB)": 16.23,
619
+ "memory/max_allocated (GiB)": 16.23,
620
+ "ppl": 1.00519,
621
+ "step": 430,
622
+ "tokens/total": 7045120,
623
+ "tokens/train_per_sec_per_gpu": 13.76,
624
+ "tokens/trainable": 2239006
625
+ },
626
+ {
627
+ "epoch": 0.4266666666666667,
628
+ "grad_norm": 0.05931795388460159,
629
+ "learning_rate": 0.00017048543689320388,
630
+ "loss": 0.00242764875292778,
631
+ "memory/device_reserved (GiB)": 18.85,
632
+ "memory/max_active (GiB)": 16.23,
633
+ "memory/max_allocated (GiB)": 16.23,
634
+ "ppl": 1.00243,
635
+ "step": 440,
636
+ "tokens/total": 7208960,
637
+ "tokens/train_per_sec_per_gpu": 14.59,
638
+ "tokens/trainable": 2290540
639
+ },
640
+ {
641
+ "epoch": 0.43636363636363634,
642
+ "grad_norm": 0.04634418711066246,
643
+ "learning_rate": 0.00017436893203883494,
644
+ "loss": 0.001389546226710081,
645
+ "memory/device_reserved (GiB)": 18.85,
646
+ "memory/max_active (GiB)": 16.23,
647
+ "memory/max_allocated (GiB)": 16.23,
648
+ "ppl": 1.00139,
649
+ "step": 450,
650
+ "tokens/total": 7372800,
651
+ "tokens/train_per_sec_per_gpu": 14.78,
652
+ "tokens/trainable": 2341852
653
+ },
654
+ {
655
+ "epoch": 0.44606060606060605,
656
+ "grad_norm": 0.04817213863134384,
657
+ "learning_rate": 0.00017825242718446602,
658
+ "loss": 0.001370794139802456,
659
+ "memory/device_reserved (GiB)": 18.85,
660
+ "memory/max_active (GiB)": 16.23,
661
+ "memory/max_allocated (GiB)": 16.23,
662
+ "ppl": 1.00137,
663
+ "step": 460,
664
+ "tokens/total": 7536640,
665
+ "tokens/train_per_sec_per_gpu": 13.77,
666
+ "tokens/trainable": 2393320
667
+ },
668
+ {
669
+ "epoch": 0.45575757575757575,
670
+ "grad_norm": 0.011335949413478374,
671
+ "learning_rate": 0.00018213592233009708,
672
+ "loss": 0.0009715131483972073,
673
+ "memory/device_reserved (GiB)": 18.85,
674
+ "memory/max_active (GiB)": 16.23,
675
+ "memory/max_allocated (GiB)": 16.23,
676
+ "ppl": 1.00097,
677
+ "step": 470,
678
+ "tokens/total": 7700480,
679
+ "tokens/train_per_sec_per_gpu": 14.52,
680
+ "tokens/trainable": 2445170
681
+ },
682
+ {
683
+ "epoch": 0.46545454545454545,
684
+ "grad_norm": 0.05298445746302605,
685
+ "learning_rate": 0.00018601941747572816,
686
+ "loss": 0.0008222623728215694,
687
+ "memory/device_reserved (GiB)": 18.85,
688
+ "memory/max_active (GiB)": 16.23,
689
+ "memory/max_allocated (GiB)": 16.23,
690
+ "ppl": 1.00082,
691
+ "step": 480,
692
+ "tokens/total": 7864320,
693
+ "tokens/train_per_sec_per_gpu": 13.87,
694
+ "tokens/trainable": 2497473
695
+ },
696
+ {
697
+ "epoch": 0.47515151515151516,
698
+ "grad_norm": 0.061686884611845016,
699
+ "learning_rate": 0.00018990291262135925,
700
+ "loss": 0.000748783303424716,
701
+ "memory/device_reserved (GiB)": 18.85,
702
+ "memory/max_active (GiB)": 16.23,
703
+ "memory/max_allocated (GiB)": 16.23,
704
+ "ppl": 1.00075,
705
+ "step": 490,
706
+ "tokens/total": 8028160,
707
+ "tokens/train_per_sec_per_gpu": 15.41,
708
+ "tokens/trainable": 2549206
709
+ },
710
+ {
711
+ "epoch": 0.48484848484848486,
712
+ "grad_norm": 0.03281249850988388,
713
+ "learning_rate": 0.0001937864077669903,
714
+ "loss": 0.0006062469445168972,
715
+ "memory/device_reserved (GiB)": 18.85,
716
+ "memory/max_active (GiB)": 16.23,
717
+ "memory/max_allocated (GiB)": 16.23,
718
+ "ppl": 1.00061,
719
+ "step": 500,
720
+ "tokens/total": 8192000,
721
+ "tokens/train_per_sec_per_gpu": 14.49,
722
+ "tokens/trainable": 2600583
723
+ },
724
+ {
725
+ "epoch": 0.49454545454545457,
726
+ "grad_norm": 0.008482079952955246,
727
+ "learning_rate": 0.0001976699029126214,
728
+ "loss": 0.0008583014830946922,
729
+ "memory/device_reserved (GiB)": 18.85,
730
+ "memory/max_active (GiB)": 16.23,
731
+ "memory/max_allocated (GiB)": 16.23,
732
+ "ppl": 1.00086,
733
+ "step": 510,
734
+ "tokens/total": 8355840,
735
+ "tokens/train_per_sec_per_gpu": 13.86,
736
+ "tokens/trainable": 2652927
737
+ },
738
+ {
739
+ "epoch": 0.5003636363636363,
740
+ "eval_loss": 0.0009036393603309989,
741
+ "eval_ppl": 1.0009,
742
+ "eval_runtime": 12.7872,
743
+ "eval_samples_per_second": 15.641,
744
+ "eval_steps_per_second": 7.82,
745
+ "memory/device_reserved (GiB)": 18.85,
746
+ "memory/max_active (GiB)": 16.23,
747
+ "memory/max_allocated (GiB)": 16.23,
748
+ "step": 516
749
+ },
750
+ {
751
+ "epoch": 0.5042424242424243,
752
+ "grad_norm": 0.04333305358886719,
753
+ "learning_rate": 0.0001999996332640321,
754
+ "loss": 0.0005093200132250785,
755
+ "memory/device_reserved (GiB)": 20.01,
756
+ "memory/max_active (GiB)": 16.23,
757
+ "memory/max_allocated (GiB)": 16.23,
758
+ "ppl": 1.00051,
759
+ "step": 520,
760
+ "tokens/total": 8519680,
761
+ "tokens/train_per_sec_per_gpu": 14.09,
762
+ "tokens/trainable": 2705083
763
+ },
764
+ {
765
+ "epoch": 0.5139393939393939,
766
+ "grad_norm": 0.02485118806362152,
767
+ "learning_rate": 0.00019999550751528488,
768
+ "loss": 0.0006649125367403031,
769
+ "memory/device_reserved (GiB)": 20.01,
770
+ "memory/max_active (GiB)": 16.23,
771
+ "memory/max_allocated (GiB)": 16.23,
772
+ "ppl": 1.00067,
773
+ "step": 530,
774
+ "tokens/total": 8683520,
775
+ "tokens/train_per_sec_per_gpu": 14.44,
776
+ "tokens/trainable": 2756975
777
+ },
778
+ {
779
+ "epoch": 0.5236363636363637,
780
+ "grad_norm": 0.03736363351345062,
781
+ "learning_rate": 0.00019998679778759294,
782
+ "loss": 0.0006726076360791921,
783
+ "memory/device_reserved (GiB)": 20.01,
784
+ "memory/max_active (GiB)": 16.23,
785
+ "memory/max_allocated (GiB)": 16.23,
786
+ "ppl": 1.00067,
787
+ "step": 540,
788
+ "tokens/total": 8847360,
789
+ "tokens/train_per_sec_per_gpu": 14.16,
790
+ "tokens/trainable": 2808076
791
+ },
792
+ {
793
+ "epoch": 0.5333333333333333,
794
+ "grad_norm": 0.05156765505671501,
795
+ "learning_rate": 0.0001999735044802263,
796
+ "loss": 0.000789718609303236,
797
+ "memory/device_reserved (GiB)": 20.01,
798
+ "memory/max_active (GiB)": 16.23,
799
+ "memory/max_allocated (GiB)": 16.23,
800
+ "ppl": 1.00079,
801
+ "step": 550,
802
+ "tokens/total": 9011200,
803
+ "tokens/train_per_sec_per_gpu": 16.36,
804
+ "tokens/trainable": 2859893
805
+ },
806
+ {
807
+ "epoch": 0.5430303030303031,
808
+ "grad_norm": 0.647550106048584,
809
+ "learning_rate": 0.00019995562820257474,
810
+ "loss": 0.003008325584232807,
811
+ "memory/device_reserved (GiB)": 20.01,
812
+ "memory/max_active (GiB)": 16.23,
813
+ "memory/max_allocated (GiB)": 16.23,
814
+ "ppl": 1.00301,
815
+ "step": 560,
816
+ "tokens/total": 9175040,
817
+ "tokens/train_per_sec_per_gpu": 14.21,
818
+ "tokens/trainable": 2911399
819
+ },
820
+ {
821
+ "epoch": 0.5527272727272727,
822
+ "grad_norm": 0.185165673494339,
823
+ "learning_rate": 0.00019993316977411993,
824
+ "loss": 0.013715097308158874,
825
+ "memory/device_reserved (GiB)": 20.01,
826
+ "memory/max_active (GiB)": 16.23,
827
+ "memory/max_allocated (GiB)": 16.23,
828
+ "ppl": 1.01381,
829
+ "step": 570,
830
+ "tokens/total": 9338880,
831
+ "tokens/train_per_sec_per_gpu": 13.85,
832
+ "tokens/trainable": 2962403
833
+ },
834
+ {
835
+ "epoch": 0.5624242424242424,
836
+ "grad_norm": 0.2401553839445114,
837
+ "learning_rate": 0.0001999061302243977,
838
+ "loss": 0.009026474505662917,
839
+ "memory/device_reserved (GiB)": 20.01,
840
+ "memory/max_active (GiB)": 16.23,
841
+ "memory/max_allocated (GiB)": 16.23,
842
+ "ppl": 1.00907,
843
+ "step": 580,
844
+ "tokens/total": 9502720,
845
+ "tokens/train_per_sec_per_gpu": 14.38,
846
+ "tokens/trainable": 3015083
847
+ },
848
+ {
849
+ "epoch": 0.5721212121212121,
850
+ "grad_norm": 0.08092579245567322,
851
+ "learning_rate": 0.000199874510792951,
852
+ "loss": 0.005716494470834732,
853
+ "memory/device_reserved (GiB)": 20.01,
854
+ "memory/max_active (GiB)": 16.23,
855
+ "memory/max_allocated (GiB)": 16.23,
856
+ "ppl": 1.00573,
857
+ "step": 590,
858
+ "tokens/total": 9666560,
859
+ "tokens/train_per_sec_per_gpu": 16.38,
860
+ "tokens/trainable": 3066501
861
+ },
862
+ {
863
+ "epoch": 0.5818181818181818,
864
+ "grad_norm": 3.418715476989746,
865
+ "learning_rate": 0.00019983831292927305,
866
+ "loss": 0.048504295945167544,
867
+ "memory/device_reserved (GiB)": 20.01,
868
+ "memory/max_active (GiB)": 16.23,
869
+ "memory/max_allocated (GiB)": 16.23,
870
+ "ppl": 1.0497,
871
+ "step": 600,
872
+ "tokens/total": 9830400,
873
+ "tokens/train_per_sec_per_gpu": 14.23,
874
+ "tokens/trainable": 3118633
875
+ },
876
+ {
877
+ "epoch": 0.5915151515151515,
878
+ "grad_norm": 0.2194036841392517,
879
+ "learning_rate": 0.00019979753829274085,
880
+ "loss": 0.03429323434829712,
881
+ "memory/device_reserved (GiB)": 20.01,
882
+ "memory/max_active (GiB)": 16.23,
883
+ "memory/max_allocated (GiB)": 16.23,
884
+ "ppl": 1.03489,
885
+ "step": 610,
886
+ "tokens/total": 9994240,
887
+ "tokens/train_per_sec_per_gpu": 13.14,
888
+ "tokens/trainable": 3170577
889
+ },
890
+ {
891
+ "epoch": 0.6012121212121212,
892
+ "grad_norm": 0.022929901257157326,
893
+ "learning_rate": 0.0001997521887525391,
894
+ "loss": 0.0015171168372035027,
895
+ "memory/device_reserved (GiB)": 20.01,
896
+ "memory/max_active (GiB)": 16.23,
897
+ "memory/max_allocated (GiB)": 16.23,
898
+ "ppl": 1.00152,
899
+ "step": 620,
900
+ "tokens/total": 10158080,
901
+ "tokens/train_per_sec_per_gpu": 14.24,
902
+ "tokens/trainable": 3221696
903
+ },
904
+ {
905
+ "epoch": 0.610909090909091,
906
+ "grad_norm": 0.10083670169115067,
907
+ "learning_rate": 0.00019970226638757458,
908
+ "loss": 0.0025377947837114333,
909
+ "memory/device_reserved (GiB)": 20.01,
910
+ "memory/max_active (GiB)": 16.23,
911
+ "memory/max_allocated (GiB)": 16.23,
912
+ "ppl": 1.00254,
913
+ "step": 630,
914
+ "tokens/total": 10321920,
915
+ "tokens/train_per_sec_per_gpu": 14.7,
916
+ "tokens/trainable": 3273775
917
+ },
918
+ {
919
+ "epoch": 0.6206060606060606,
920
+ "grad_norm": 0.01761380024254322,
921
+ "learning_rate": 0.00019964777348638083,
922
+ "loss": 0.002281896211206913,
923
+ "memory/device_reserved (GiB)": 20.01,
924
+ "memory/max_active (GiB)": 16.23,
925
+ "memory/max_allocated (GiB)": 16.23,
926
+ "ppl": 1.00228,
927
+ "step": 640,
928
+ "tokens/total": 10485760,
929
+ "tokens/train_per_sec_per_gpu": 14.89,
930
+ "tokens/trainable": 3325516
931
+ },
932
+ {
933
+ "epoch": 0.6303030303030303,
934
+ "grad_norm": 0.004510029684752226,
935
+ "learning_rate": 0.00019958871254701315,
936
+ "loss": 0.0009477110579609871,
937
+ "memory/device_reserved (GiB)": 20.01,
938
+ "memory/max_active (GiB)": 16.23,
939
+ "memory/max_allocated (GiB)": 16.23,
940
+ "ppl": 1.00095,
941
+ "step": 650,
942
+ "tokens/total": 10649600,
943
+ "tokens/train_per_sec_per_gpu": 16.46,
944
+ "tokens/trainable": 3377214
945
+ },
946
+ {
947
+ "epoch": 0.64,
948
+ "grad_norm": 0.05332477018237114,
949
+ "learning_rate": 0.0001995250862769342,
950
+ "loss": 0.0005660496186465025,
951
+ "memory/device_reserved (GiB)": 20.01,
952
+ "memory/max_active (GiB)": 16.23,
953
+ "memory/max_allocated (GiB)": 16.23,
954
+ "ppl": 1.00057,
955
+ "step": 660,
956
+ "tokens/total": 10813440,
957
+ "tokens/train_per_sec_per_gpu": 14.52,
958
+ "tokens/trainable": 3428627
959
+ },
960
+ {
961
+ "epoch": 0.6496969696969697,
962
+ "grad_norm": 0.03861689195036888,
963
+ "learning_rate": 0.0001994568975928899,
964
+ "loss": 0.0008976863697171211,
965
+ "memory/device_reserved (GiB)": 20.01,
966
+ "memory/max_active (GiB)": 16.23,
967
+ "memory/max_allocated (GiB)": 16.23,
968
+ "ppl": 1.0009,
969
+ "step": 670,
970
+ "tokens/total": 10977280,
971
+ "tokens/train_per_sec_per_gpu": 15.66,
972
+ "tokens/trainable": 3480170
973
+ },
974
+ {
975
+ "epoch": 0.6593939393939394,
976
+ "grad_norm": 0.021123304963111877,
977
+ "learning_rate": 0.00019938414962077553,
978
+ "loss": 0.0009612766094505787,
979
+ "memory/device_reserved (GiB)": 20.01,
980
+ "memory/max_active (GiB)": 16.23,
981
+ "memory/max_allocated (GiB)": 16.23,
982
+ "ppl": 1.00096,
983
+ "step": 680,
984
+ "tokens/total": 11141120,
985
+ "tokens/train_per_sec_per_gpu": 15.15,
986
+ "tokens/trainable": 3532037
987
+ },
988
+ {
989
+ "epoch": 0.6690909090909091,
990
+ "grad_norm": 0.02421347238123417,
991
+ "learning_rate": 0.00019930684569549264,
992
+ "loss": 0.001021684519946575,
993
+ "memory/device_reserved (GiB)": 20.01,
994
+ "memory/max_active (GiB)": 16.23,
995
+ "memory/max_allocated (GiB)": 16.23,
996
+ "ppl": 1.00102,
997
+ "step": 690,
998
+ "tokens/total": 11304960,
999
+ "tokens/train_per_sec_per_gpu": 14.16,
1000
+ "tokens/trainable": 3583461
1001
+ },
1002
+ {
1003
+ "epoch": 0.6787878787878788,
1004
+ "grad_norm": 0.05008835345506668,
1005
+ "learning_rate": 0.00019922498936079613,
1006
+ "loss": 0.0007617876864969731,
1007
+ "memory/device_reserved (GiB)": 20.01,
1008
+ "memory/max_active (GiB)": 16.23,
1009
+ "memory/max_allocated (GiB)": 16.23,
1010
+ "ppl": 1.00076,
1011
+ "step": 700,
1012
+ "tokens/total": 11468800,
1013
+ "tokens/train_per_sec_per_gpu": 14.08,
1014
+ "tokens/trainable": 3634649
1015
+ },
1016
+ {
1017
+ "epoch": 0.6884848484848485,
1018
+ "grad_norm": 0.035733792930841446,
1019
+ "learning_rate": 0.00019913858436913171,
1020
+ "loss": 0.0012347914278507232,
1021
+ "memory/device_reserved (GiB)": 20.01,
1022
+ "memory/max_active (GiB)": 16.23,
1023
+ "memory/max_allocated (GiB)": 16.23,
1024
+ "ppl": 1.00124,
1025
+ "step": 710,
1026
+ "tokens/total": 11632640,
1027
+ "tokens/train_per_sec_per_gpu": 14.45,
1028
+ "tokens/trainable": 3685786
1029
+ },
1030
+ {
1031
+ "epoch": 0.6981818181818182,
1032
+ "grad_norm": 0.010948767885565758,
1033
+ "learning_rate": 0.00019904763468146393,
1034
+ "loss": 0.0008165687322616577,
1035
+ "memory/device_reserved (GiB)": 20.01,
1036
+ "memory/max_active (GiB)": 16.23,
1037
+ "memory/max_allocated (GiB)": 16.23,
1038
+ "ppl": 1.00082,
1039
+ "step": 720,
1040
+ "tokens/total": 11796480,
1041
+ "tokens/train_per_sec_per_gpu": 15.77,
1042
+ "tokens/trainable": 3737566
1043
+ },
1044
+ {
1045
+ "epoch": 0.7078787878787879,
1046
+ "grad_norm": 0.03577027469873428,
1047
+ "learning_rate": 0.00019895214446709463,
1048
+ "loss": 0.001333119161427021,
1049
+ "memory/device_reserved (GiB)": 20.01,
1050
+ "memory/max_active (GiB)": 16.23,
1051
+ "memory/max_allocated (GiB)": 16.23,
1052
+ "ppl": 1.00133,
1053
+ "step": 730,
1054
+ "tokens/total": 11960320,
1055
+ "tokens/train_per_sec_per_gpu": 13.98,
1056
+ "tokens/trainable": 3789817
1057
+ },
1058
+ {
1059
+ "epoch": 0.7175757575757575,
1060
+ "grad_norm": 0.03971279785037041,
1061
+ "learning_rate": 0.00019885211810347184,
1062
+ "loss": 0.0011184611357748508,
1063
+ "memory/device_reserved (GiB)": 20.01,
1064
+ "memory/max_active (GiB)": 16.23,
1065
+ "memory/max_allocated (GiB)": 16.23,
1066
+ "ppl": 1.00112,
1067
+ "step": 740,
1068
+ "tokens/total": 12124160,
1069
+ "tokens/train_per_sec_per_gpu": 14.67,
1070
+ "tokens/trainable": 3841912
1071
+ },
1072
+ {
1073
+ "epoch": 0.7272727272727273,
1074
+ "grad_norm": 0.06546575576066971,
1075
+ "learning_rate": 0.00019874756017598894,
1076
+ "loss": 0.0012452728115022182,
1077
+ "memory/device_reserved (GiB)": 20.01,
1078
+ "memory/max_active (GiB)": 16.23,
1079
+ "memory/max_allocated (GiB)": 16.23,
1080
+ "ppl": 1.00125,
1081
+ "step": 750,
1082
+ "tokens/total": 12288000,
1083
+ "tokens/train_per_sec_per_gpu": 14.58,
1084
+ "tokens/trainable": 3893725
1085
+ },
1086
+ {
1087
+ "epoch": 0.7369696969696969,
1088
+ "grad_norm": 0.047058816999197006,
1089
+ "learning_rate": 0.00019863847547777467,
1090
+ "loss": 0.0008146104402840138,
1091
+ "memory/device_reserved (GiB)": 20.01,
1092
+ "memory/max_active (GiB)": 16.23,
1093
+ "memory/max_allocated (GiB)": 16.23,
1094
+ "ppl": 1.00081,
1095
+ "step": 760,
1096
+ "tokens/total": 12451840,
1097
+ "tokens/train_per_sec_per_gpu": 13.49,
1098
+ "tokens/trainable": 3945033
1099
+ },
1100
+ {
1101
+ "epoch": 0.7466666666666667,
1102
+ "grad_norm": 0.028811641037464142,
1103
+ "learning_rate": 0.00019852486900947327,
1104
+ "loss": 0.0008652995340526104,
1105
+ "memory/device_reserved (GiB)": 20.01,
1106
+ "memory/max_active (GiB)": 16.23,
1107
+ "memory/max_allocated (GiB)": 16.23,
1108
+ "ppl": 1.00087,
1109
+ "step": 770,
1110
+ "tokens/total": 12615680,
1111
+ "tokens/train_per_sec_per_gpu": 15.12,
1112
+ "tokens/trainable": 3996749
1113
+ },
1114
+ {
1115
+ "epoch": 0.7563636363636363,
1116
+ "grad_norm": 0.012203546240925789,
1117
+ "learning_rate": 0.0001984067459790153,
1118
+ "loss": 0.000670672720298171,
1119
+ "memory/device_reserved (GiB)": 20.01,
1120
+ "memory/max_active (GiB)": 16.23,
1121
+ "memory/max_allocated (GiB)": 16.23,
1122
+ "ppl": 1.00067,
1123
+ "step": 780,
1124
+ "tokens/total": 12779520,
1125
+ "tokens/train_per_sec_per_gpu": 13.71,
1126
+ "tokens/trainable": 4048173
1127
+ },
1128
+ {
1129
+ "epoch": 0.7660606060606061,
1130
+ "grad_norm": 0.016218814998865128,
1131
+ "learning_rate": 0.0001982841118013789,
1132
+ "loss": 0.00046353964135050776,
1133
+ "memory/device_reserved (GiB)": 20.01,
1134
+ "memory/max_active (GiB)": 16.23,
1135
+ "memory/max_allocated (GiB)": 16.23,
1136
+ "ppl": 1.00046,
1137
+ "step": 790,
1138
+ "tokens/total": 12943360,
1139
+ "tokens/train_per_sec_per_gpu": 15.1,
1140
+ "tokens/trainable": 4099789
1141
+ },
1142
+ {
1143
+ "epoch": 0.7757575757575758,
1144
+ "grad_norm": 0.034673016518354416,
1145
+ "learning_rate": 0.00019815697209834147,
1146
+ "loss": 0.000707306619733572,
1147
+ "memory/device_reserved (GiB)": 20.01,
1148
+ "memory/max_active (GiB)": 16.23,
1149
+ "memory/max_allocated (GiB)": 16.23,
1150
+ "ppl": 1.00071,
1151
+ "step": 800,
1152
+ "tokens/total": 13107200,
1153
+ "tokens/train_per_sec_per_gpu": 14.45,
1154
+ "tokens/trainable": 4150960
1155
+ },
1156
+ {
1157
+ "epoch": 0.7854545454545454,
1158
+ "grad_norm": 0.0022127812262624502,
1159
+ "learning_rate": 0.00019802533269822208,
1160
+ "loss": 0.00021896373946219682,
1161
+ "memory/device_reserved (GiB)": 20.01,
1162
+ "memory/max_active (GiB)": 16.23,
1163
+ "memory/max_allocated (GiB)": 16.23,
1164
+ "ppl": 1.00022,
1165
+ "step": 810,
1166
+ "tokens/total": 13271040,
1167
+ "tokens/train_per_sec_per_gpu": 14.75,
1168
+ "tokens/trainable": 4202984
1169
+ },
1170
+ {
1171
+ "epoch": 0.7951515151515152,
1172
+ "grad_norm": 0.000919274752959609,
1173
+ "learning_rate": 0.00019788919963561422,
1174
+ "loss": 0.00043264860287308695,
1175
+ "memory/device_reserved (GiB)": 20.01,
1176
+ "memory/max_active (GiB)": 16.23,
1177
+ "memory/max_allocated (GiB)": 16.23,
1178
+ "ppl": 1.00043,
1179
+ "step": 820,
1180
+ "tokens/total": 13434880,
1181
+ "tokens/train_per_sec_per_gpu": 14.06,
1182
+ "tokens/trainable": 4254907
1183
+ },
1184
+ {
1185
+ "epoch": 0.8048484848484848,
1186
+ "grad_norm": 0.007699873298406601,
1187
+ "learning_rate": 0.00019774857915110913,
1188
+ "loss": 0.0003196246922016144,
1189
+ "memory/device_reserved (GiB)": 20.01,
1190
+ "memory/max_active (GiB)": 16.23,
1191
+ "memory/max_allocated (GiB)": 16.23,
1192
+ "ppl": 1.00032,
1193
+ "step": 830,
1194
+ "tokens/total": 13598720,
1195
+ "tokens/train_per_sec_per_gpu": 14.75,
1196
+ "tokens/trainable": 4306095
1197
+ },
1198
+ {
1199
+ "epoch": 0.8145454545454546,
1200
+ "grad_norm": 0.015523642301559448,
1201
+ "learning_rate": 0.00019760347769100987,
1202
+ "loss": 0.0004476988688111305,
1203
+ "memory/device_reserved (GiB)": 20.01,
1204
+ "memory/max_active (GiB)": 16.23,
1205
+ "memory/max_allocated (GiB)": 16.23,
1206
+ "ppl": 1.00045,
1207
+ "step": 840,
1208
+ "tokens/total": 13762560,
1209
+ "tokens/train_per_sec_per_gpu": 14.14,
1210
+ "tokens/trainable": 4357442
1211
+ },
1212
+ {
1213
+ "epoch": 0.8242424242424242,
1214
+ "grad_norm": 0.013460986316204071,
1215
+ "learning_rate": 0.00019745390190703565,
1216
+ "loss": 0.0004673306830227375,
1217
+ "memory/device_reserved (GiB)": 20.01,
1218
+ "memory/max_active (GiB)": 16.23,
1219
+ "memory/max_allocated (GiB)": 16.23,
1220
+ "ppl": 1.00047,
1221
+ "step": 850,
1222
+ "tokens/total": 13926400,
1223
+ "tokens/train_per_sec_per_gpu": 14.1,
1224
+ "tokens/trainable": 4409277
1225
+ },
1226
+ {
1227
+ "epoch": 0.833939393939394,
1228
+ "grad_norm": 0.0014691110700368881,
1229
+ "learning_rate": 0.0001972998586560169,
1230
+ "loss": 0.0003277578856796026,
1231
+ "memory/device_reserved (GiB)": 20.01,
1232
+ "memory/max_active (GiB)": 16.23,
1233
+ "memory/max_allocated (GiB)": 16.23,
1234
+ "ppl": 1.00033,
1235
+ "step": 860,
1236
+ "tokens/total": 14090240,
1237
+ "tokens/train_per_sec_per_gpu": 14.28,
1238
+ "tokens/trainable": 4460714
1239
+ },
1240
+ {
1241
+ "epoch": 0.8436363636363636,
1242
+ "grad_norm": 0.001358041656203568,
1243
+ "learning_rate": 0.00019714135499958112,
1244
+ "loss": 0.00032470382284373046,
1245
+ "memory/device_reserved (GiB)": 20.01,
1246
+ "memory/max_active (GiB)": 16.23,
1247
+ "memory/max_allocated (GiB)": 16.23,
1248
+ "ppl": 1.00032,
1249
+ "step": 870,
1250
+ "tokens/total": 14254080,
1251
+ "tokens/train_per_sec_per_gpu": 13.85,
1252
+ "tokens/trainable": 4511989
1253
+ },
1254
+ {
1255
+ "epoch": 0.8533333333333334,
1256
+ "grad_norm": 0.04510723799467087,
1257
+ "learning_rate": 0.0001969783982038289,
1258
+ "loss": 0.00023182881996035575,
1259
+ "memory/device_reserved (GiB)": 20.01,
1260
+ "memory/max_active (GiB)": 16.23,
1261
+ "memory/max_allocated (GiB)": 16.23,
1262
+ "ppl": 1.00023,
1263
+ "step": 880,
1264
+ "tokens/total": 14417920,
1265
+ "tokens/train_per_sec_per_gpu": 15.41,
1266
+ "tokens/trainable": 4563354
1267
+ },
1268
+ {
1269
+ "epoch": 0.863030303030303,
1270
+ "grad_norm": 0.14508692920207977,
1271
+ "learning_rate": 0.00019681099573900113,
1272
+ "loss": 0.00026136748492717744,
1273
+ "memory/device_reserved (GiB)": 20.01,
1274
+ "memory/max_active (GiB)": 16.23,
1275
+ "memory/max_allocated (GiB)": 16.23,
1276
+ "ppl": 1.00026,
1277
+ "step": 890,
1278
+ "tokens/total": 14581760,
1279
+ "tokens/train_per_sec_per_gpu": 13.85,
1280
+ "tokens/trainable": 4615691
1281
+ },
1282
+ {
1283
+ "epoch": 0.8727272727272727,
1284
+ "grad_norm": 0.010969490744173527,
1285
+ "learning_rate": 0.00019663915527913625,
1286
+ "loss": 0.00016044279327616097,
1287
+ "memory/device_reserved (GiB)": 20.01,
1288
+ "memory/max_active (GiB)": 16.23,
1289
+ "memory/max_allocated (GiB)": 16.23,
1290
+ "ppl": 1.00016,
1291
+ "step": 900,
1292
+ "tokens/total": 14745600,
1293
+ "tokens/train_per_sec_per_gpu": 15.76,
1294
+ "tokens/trainable": 4667433
1295
+ },
1296
+ {
1297
+ "epoch": 0.8824242424242424,
1298
+ "grad_norm": 0.03874114155769348,
1299
+ "learning_rate": 0.00019646288470171868,
1300
+ "loss": 0.0004159804433584213,
1301
+ "memory/device_reserved (GiB)": 20.01,
1302
+ "memory/max_active (GiB)": 16.23,
1303
+ "memory/max_allocated (GiB)": 16.23,
1304
+ "ppl": 1.00042,
1305
+ "step": 910,
1306
+ "tokens/total": 14909440,
1307
+ "tokens/train_per_sec_per_gpu": 16.01,
1308
+ "tokens/trainable": 4719807
1309
+ },
1310
+ {
1311
+ "epoch": 0.8921212121212121,
1312
+ "grad_norm": 0.044620465487241745,
1313
+ "learning_rate": 0.00019628219208731756,
1314
+ "loss": 0.0006739750038832426,
1315
+ "memory/device_reserved (GiB)": 20.01,
1316
+ "memory/max_active (GiB)": 16.23,
1317
+ "memory/max_allocated (GiB)": 16.23,
1318
+ "ppl": 1.00067,
1319
+ "step": 920,
1320
+ "tokens/total": 15073280,
1321
+ "tokens/train_per_sec_per_gpu": 15.05,
1322
+ "tokens/trainable": 4771772
1323
+ },
1324
+ {
1325
+ "epoch": 0.9018181818181819,
1326
+ "grad_norm": 0.024856949225068092,
1327
+ "learning_rate": 0.00019609708571921645,
1328
+ "loss": 0.00039347023703157903,
1329
+ "memory/device_reserved (GiB)": 20.01,
1330
+ "memory/max_active (GiB)": 16.23,
1331
+ "memory/max_allocated (GiB)": 16.23,
1332
+ "ppl": 1.00039,
1333
+ "step": 930,
1334
+ "tokens/total": 15237120,
1335
+ "tokens/train_per_sec_per_gpu": 15.16,
1336
+ "tokens/trainable": 4823415
1337
+ },
1338
+ {
1339
+ "epoch": 0.9115151515151515,
1340
+ "grad_norm": 0.022198157384991646,
1341
+ "learning_rate": 0.0001959075740830335,
1342
+ "loss": 0.0005907822400331497,
1343
+ "memory/device_reserved (GiB)": 20.01,
1344
+ "memory/max_active (GiB)": 16.23,
1345
+ "memory/max_allocated (GiB)": 16.23,
1346
+ "ppl": 1.00059,
1347
+ "step": 940,
1348
+ "tokens/total": 15400960,
1349
+ "tokens/train_per_sec_per_gpu": 15.36,
1350
+ "tokens/trainable": 4875269
1351
+ },
1352
+ {
1353
+ "epoch": 0.9212121212121213,
1354
+ "grad_norm": 0.01670038513839245,
1355
+ "learning_rate": 0.00019571366586633245,
1356
+ "loss": 0.00027316866908222437,
1357
+ "memory/device_reserved (GiB)": 20.01,
1358
+ "memory/max_active (GiB)": 16.23,
1359
+ "memory/max_allocated (GiB)": 16.23,
1360
+ "ppl": 1.00027,
1361
+ "step": 950,
1362
+ "tokens/total": 15564800,
1363
+ "tokens/train_per_sec_per_gpu": 15.11,
1364
+ "tokens/trainable": 4927244
1365
+ },
1366
+ {
1367
+ "epoch": 0.9309090909090909,
1368
+ "grad_norm": 0.021392742171883583,
1369
+ "learning_rate": 0.00019551536995822454,
1370
+ "loss": 0.0004320886451750994,
1371
+ "memory/device_reserved (GiB)": 20.01,
1372
+ "memory/max_active (GiB)": 16.23,
1373
+ "memory/max_allocated (GiB)": 16.23,
1374
+ "ppl": 1.00043,
1375
+ "step": 960,
1376
+ "tokens/total": 15728640,
1377
+ "tokens/train_per_sec_per_gpu": 14.16,
1378
+ "tokens/trainable": 4979068
1379
+ },
1380
+ {
1381
+ "epoch": 0.9406060606060606,
1382
+ "grad_norm": 0.028143158182501793,
1383
+ "learning_rate": 0.00019531269544896076,
1384
+ "loss": 0.0005637989845126868,
1385
+ "memory/device_reserved (GiB)": 20.01,
1386
+ "memory/max_active (GiB)": 16.23,
1387
+ "memory/max_allocated (GiB)": 16.23,
1388
+ "ppl": 1.00056,
1389
+ "step": 970,
1390
+ "tokens/total": 15892480,
1391
+ "tokens/train_per_sec_per_gpu": 14.26,
1392
+ "tokens/trainable": 5030980
1393
+ },
1394
+ {
1395
+ "epoch": 0.9503030303030303,
1396
+ "grad_norm": 0.077091746032238,
1397
+ "learning_rate": 0.00019510565162951537,
1398
+ "loss": 0.0010597245767712594,
1399
+ "memory/device_reserved (GiB)": 20.01,
1400
+ "memory/max_active (GiB)": 16.23,
1401
+ "memory/max_allocated (GiB)": 16.23,
1402
+ "ppl": 1.00106,
1403
+ "step": 980,
1404
+ "tokens/total": 16056320,
1405
+ "tokens/train_per_sec_per_gpu": 14.04,
1406
+ "tokens/trainable": 5082759
1407
+ },
1408
+ {
1409
+ "epoch": 0.96,
1410
+ "grad_norm": 0.04455556347966194,
1411
+ "learning_rate": 0.00019489424799115984,
1412
+ "loss": 0.0009517236612737179,
1413
+ "memory/device_reserved (GiB)": 20.01,
1414
+ "memory/max_active (GiB)": 16.23,
1415
+ "memory/max_allocated (GiB)": 16.23,
1416
+ "ppl": 1.00095,
1417
+ "step": 990,
1418
+ "tokens/total": 16220160,
1419
+ "tokens/train_per_sec_per_gpu": 13.04,
1420
+ "tokens/trainable": 5134379
1421
+ },
1422
+ {
1423
+ "epoch": 0.9696969696969697,
1424
+ "grad_norm": 0.03573840856552124,
1425
+ "learning_rate": 0.00019467849422502784,
1426
+ "loss": 0.0008812972344458103,
1427
+ "memory/device_reserved (GiB)": 20.01,
1428
+ "memory/max_active (GiB)": 16.23,
1429
+ "memory/max_allocated (GiB)": 16.23,
1430
+ "ppl": 1.00088,
1431
+ "step": 1000,
1432
+ "tokens/total": 16384000,
1433
+ "tokens/train_per_sec_per_gpu": 15.23,
1434
+ "tokens/trainable": 5186184
1435
+ },
1436
+ {
1437
+ "epoch": 0.9793939393939394,
1438
+ "grad_norm": 0.0006549305398948491,
1439
+ "learning_rate": 0.0001944584002216709,
1440
+ "loss": 0.0006358013488352299,
1441
+ "memory/device_reserved (GiB)": 20.01,
1442
+ "memory/max_active (GiB)": 16.23,
1443
+ "memory/max_allocated (GiB)": 16.23,
1444
+ "ppl": 1.00064,
1445
+ "step": 1010,
1446
+ "tokens/total": 16547840,
1447
+ "tokens/train_per_sec_per_gpu": 16.1,
1448
+ "tokens/trainable": 5238320
1449
+ },
1450
+ {
1451
+ "epoch": 0.9890909090909091,
1452
+ "grad_norm": 0.021742813289165497,
1453
+ "learning_rate": 0.00019423397607060507,
1454
+ "loss": 0.000400003744289279,
1455
+ "memory/device_reserved (GiB)": 20.01,
1456
+ "memory/max_active (GiB)": 16.23,
1457
+ "memory/max_allocated (GiB)": 16.23,
1458
+ "ppl": 1.0004,
1459
+ "step": 1020,
1460
+ "tokens/total": 16711680,
1461
+ "tokens/train_per_sec_per_gpu": 14.53,
1462
+ "tokens/trainable": 5290445
1463
+ },
1464
+ {
1465
+ "epoch": 0.9987878787878788,
1466
+ "grad_norm": 0.04323820024728775,
1467
+ "learning_rate": 0.00019400523205984833,
1468
+ "loss": 0.0002954686991870403,
1469
+ "memory/device_reserved (GiB)": 20.01,
1470
+ "memory/max_active (GiB)": 16.23,
1471
+ "memory/max_allocated (GiB)": 16.23,
1472
+ "ppl": 1.0003,
1473
+ "step": 1030,
1474
+ "tokens/total": 16875520,
1475
+ "tokens/train_per_sec_per_gpu": 14.98,
1476
+ "tokens/trainable": 5342720
1477
+ },
1478
+ {
1479
+ "epoch": 1.001939393939394,
1480
+ "eval_loss": 0.00047458006883971393,
1481
+ "eval_ppl": 1.00047,
1482
+ "eval_runtime": 11.7938,
1483
+ "eval_samples_per_second": 16.958,
1484
+ "eval_steps_per_second": 8.479,
1485
+ "memory/device_reserved (GiB)": 20.01,
1486
+ "memory/max_active (GiB)": 16.73,
1487
+ "memory/max_allocated (GiB)": 16.73,
1488
+ "step": 1032
1489
+ },
1490
+ {
1491
+ "epoch": 1.0096969696969698,
1492
+ "grad_norm": 0.000988126266747713,
1493
+ "learning_rate": 0.00019377217867544907,
1494
+ "loss": 0.0004762394353747368,
1495
+ "memory/device_reserved (GiB)": 20.01,
1496
+ "memory/max_active (GiB)": 16.23,
1497
+ "memory/max_allocated (GiB)": 16.23,
1498
+ "ppl": 1.00048,
1499
+ "step": 1040,
1500
+ "tokens/total": 17051648,
1501
+ "tokens/train_per_sec_per_gpu": 14.47,
1502
+ "tokens/trainable": 5398184
1503
+ },
1504
+ {
1505
+ "epoch": 1.0193939393939393,
1506
+ "grad_norm": 0.0011711094994097948,
1507
+ "learning_rate": 0.00019353482660100537,
1508
+ "loss": 0.00022675264626741408,
1509
+ "memory/device_reserved (GiB)": 20.01,
1510
+ "memory/max_active (GiB)": 16.23,
1511
+ "memory/max_allocated (GiB)": 16.23,
1512
+ "ppl": 1.00023,
1513
+ "step": 1050,
1514
+ "tokens/total": 17215488,
1515
+ "tokens/train_per_sec_per_gpu": 14.05,
1516
+ "tokens/trainable": 5450329
1517
+ },
1518
+ {
1519
+ "epoch": 1.029090909090909,
1520
+ "grad_norm": 0.007319436874240637,
1521
+ "learning_rate": 0.0001932931867171751,
1522
+ "loss": 0.0003059083363041282,
1523
+ "memory/device_reserved (GiB)": 20.01,
1524
+ "memory/max_active (GiB)": 16.23,
1525
+ "memory/max_allocated (GiB)": 16.23,
1526
+ "ppl": 1.00031,
1527
+ "step": 1060,
1528
+ "tokens/total": 17379328,
1529
+ "tokens/train_per_sec_per_gpu": 13.66,
1530
+ "tokens/trainable": 5502706
1531
+ },
1532
+ {
1533
+ "epoch": 1.0387878787878788,
1534
+ "grad_norm": 0.00967186689376831,
1535
+ "learning_rate": 0.0001930472701011773,
1536
+ "loss": 0.0003639918984845281,
1537
+ "memory/device_reserved (GiB)": 20.01,
1538
+ "memory/max_active (GiB)": 16.23,
1539
+ "memory/max_allocated (GiB)": 16.23,
1540
+ "ppl": 1.00036,
1541
+ "step": 1070,
1542
+ "tokens/total": 17543168,
1543
+ "tokens/train_per_sec_per_gpu": 15.36,
1544
+ "tokens/trainable": 5554957
1545
+ },
1546
+ {
1547
+ "epoch": 1.0484848484848486,
1548
+ "grad_norm": 0.0018478024285286665,
1549
+ "learning_rate": 0.00019279708802628437,
1550
+ "loss": 0.0002576910424977541,
1551
+ "memory/device_reserved (GiB)": 20.01,
1552
+ "memory/max_active (GiB)": 16.23,
1553
+ "memory/max_allocated (GiB)": 16.23,
1554
+ "ppl": 1.00026,
1555
+ "step": 1080,
1556
+ "tokens/total": 17707008,
1557
+ "tokens/train_per_sec_per_gpu": 14.73,
1558
+ "tokens/trainable": 5607534
1559
+ },
1560
+ {
1561
+ "epoch": 1.0581818181818181,
1562
+ "grad_norm": 0.018235478550195694,
1563
+ "learning_rate": 0.00019254265196130517,
1564
+ "loss": 0.0003647733014076948,
1565
+ "memory/device_reserved (GiB)": 20.01,
1566
+ "memory/max_active (GiB)": 16.23,
1567
+ "memory/max_allocated (GiB)": 16.23,
1568
+ "ppl": 1.00036,
1569
+ "step": 1090,
1570
+ "tokens/total": 17870848,
1571
+ "tokens/train_per_sec_per_gpu": 14.24,
1572
+ "tokens/trainable": 5659689
1573
+ },
1574
+ {
1575
+ "epoch": 1.0678787878787879,
1576
+ "grad_norm": 0.024314021691679955,
1577
+ "learning_rate": 0.0001922839735700593,
1578
+ "loss": 0.00030459570698440077,
1579
+ "memory/device_reserved (GiB)": 20.01,
1580
+ "memory/max_active (GiB)": 16.23,
1581
+ "memory/max_allocated (GiB)": 16.23,
1582
+ "ppl": 1.0003,
1583
+ "step": 1100,
1584
+ "tokens/total": 18034688,
1585
+ "tokens/train_per_sec_per_gpu": 13.67,
1586
+ "tokens/trainable": 5711346
1587
+ },
1588
+ {
1589
+ "epoch": 1.0775757575757576,
1590
+ "grad_norm": 0.0177497286349535,
1591
+ "learning_rate": 0.0001920210647108425,
1592
+ "loss": 0.00023341022897511722,
1593
+ "memory/device_reserved (GiB)": 20.01,
1594
+ "memory/max_active (GiB)": 16.23,
1595
+ "memory/max_allocated (GiB)": 16.23,
1596
+ "ppl": 1.00023,
1597
+ "step": 1110,
1598
+ "tokens/total": 18198528,
1599
+ "tokens/train_per_sec_per_gpu": 14.13,
1600
+ "tokens/trainable": 5763094
1601
+ },
1602
+ {
1603
+ "epoch": 1.0872727272727274,
1604
+ "grad_norm": 0.005781313870102167,
1605
+ "learning_rate": 0.00019175393743588295,
1606
+ "loss": 0.0002974884817376733,
1607
+ "memory/device_reserved (GiB)": 20.01,
1608
+ "memory/max_active (GiB)": 16.23,
1609
+ "memory/max_allocated (GiB)": 16.23,
1610
+ "ppl": 1.0003,
1611
+ "step": 1120,
1612
+ "tokens/total": 18362368,
1613
+ "tokens/train_per_sec_per_gpu": 14.55,
1614
+ "tokens/trainable": 5815101
1615
+ },
1616
+ {
1617
+ "epoch": 1.096969696969697,
1618
+ "grad_norm": 0.0026403339579701424,
1619
+ "learning_rate": 0.00019148260399078887,
1620
+ "loss": 0.00010604445124045015,
1621
+ "memory/device_reserved (GiB)": 20.01,
1622
+ "memory/max_active (GiB)": 16.23,
1623
+ "memory/max_allocated (GiB)": 16.23,
1624
+ "ppl": 1.00011,
1625
+ "step": 1130,
1626
+ "tokens/total": 18526208,
1627
+ "tokens/train_per_sec_per_gpu": 13.87,
1628
+ "tokens/trainable": 5866763
1629
+ },
1630
+ {
1631
+ "epoch": 1.1066666666666667,
1632
+ "grad_norm": 0.03586777299642563,
1633
+ "learning_rate": 0.000191207076813987,
1634
+ "loss": 0.00027820770628750324,
1635
+ "memory/device_reserved (GiB)": 20.01,
1636
+ "memory/max_active (GiB)": 16.23,
1637
+ "memory/max_allocated (GiB)": 16.23,
1638
+ "ppl": 1.00028,
1639
+ "step": 1140,
1640
+ "tokens/total": 18690048,
1641
+ "tokens/train_per_sec_per_gpu": 13.83,
1642
+ "tokens/trainable": 5918322
1643
+ },
1644
+ {
1645
+ "epoch": 1.1163636363636364,
1646
+ "grad_norm": 0.007715190295130014,
1647
+ "learning_rate": 0.00019092736853615257,
1648
+ "loss": 0.00029321699403226373,
1649
+ "memory/device_reserved (GiB)": 20.01,
1650
+ "memory/max_active (GiB)": 16.23,
1651
+ "memory/max_allocated (GiB)": 16.23,
1652
+ "ppl": 1.00029,
1653
+ "step": 1150,
1654
+ "tokens/total": 18853888,
1655
+ "tokens/train_per_sec_per_gpu": 13.95,
1656
+ "tokens/trainable": 5970153
1657
+ },
1658
+ {
1659
+ "epoch": 1.126060606060606,
1660
+ "grad_norm": 0.05122547224164009,
1661
+ "learning_rate": 0.00019064349197963013,
1662
+ "loss": 0.0005070990417152643,
1663
+ "memory/device_reserved (GiB)": 20.01,
1664
+ "memory/max_active (GiB)": 16.23,
1665
+ "memory/max_allocated (GiB)": 16.23,
1666
+ "ppl": 1.00051,
1667
+ "step": 1160,
1668
+ "tokens/total": 19017728,
1669
+ "tokens/train_per_sec_per_gpu": 15.51,
1670
+ "tokens/trainable": 6021741
1671
+ },
1672
+ {
1673
+ "epoch": 1.1357575757575757,
1674
+ "grad_norm": 0.032420564442873,
1675
+ "learning_rate": 0.000190355460157846,
1676
+ "loss": 0.00031497194431722163,
1677
+ "memory/device_reserved (GiB)": 20.01,
1678
+ "memory/max_active (GiB)": 16.23,
1679
+ "memory/max_allocated (GiB)": 16.23,
1680
+ "ppl": 1.00032,
1681
+ "step": 1170,
1682
+ "tokens/total": 19181568,
1683
+ "tokens/train_per_sec_per_gpu": 16.05,
1684
+ "tokens/trainable": 6074092
1685
+ },
1686
+ {
1687
+ "epoch": 1.1454545454545455,
1688
+ "grad_norm": 0.03688061609864235,
1689
+ "learning_rate": 0.00019006328627471132,
1690
+ "loss": 0.0003225028282031417,
1691
+ "memory/device_reserved (GiB)": 20.01,
1692
+ "memory/max_active (GiB)": 16.23,
1693
+ "memory/max_allocated (GiB)": 16.23,
1694
+ "ppl": 1.00032,
1695
+ "step": 1180,
1696
+ "tokens/total": 19345408,
1697
+ "tokens/train_per_sec_per_gpu": 14.1,
1698
+ "tokens/trainable": 6126315
1699
+ },
1700
+ {
1701
+ "epoch": 1.1551515151515153,
1702
+ "grad_norm": 0.03359396383166313,
1703
+ "learning_rate": 0.00018976698372401716,
1704
+ "loss": 0.0004557626787573099,
1705
+ "memory/device_reserved (GiB)": 20.01,
1706
+ "memory/max_active (GiB)": 16.23,
1707
+ "memory/max_allocated (GiB)": 16.23,
1708
+ "ppl": 1.00046,
1709
+ "step": 1190,
1710
+ "tokens/total": 19509248,
1711
+ "tokens/train_per_sec_per_gpu": 14.6,
1712
+ "tokens/trainable": 6178392
1713
+ },
1714
+ {
1715
+ "epoch": 1.1648484848484848,
1716
+ "grad_norm": 0.020522581413388252,
1717
+ "learning_rate": 0.0001894665660888202,
1718
+ "loss": 0.0006435967981815339,
1719
+ "memory/device_reserved (GiB)": 20.01,
1720
+ "memory/max_active (GiB)": 16.23,
1721
+ "memory/max_allocated (GiB)": 16.23,
1722
+ "ppl": 1.00064,
1723
+ "step": 1200,
1724
+ "tokens/total": 19673088,
1725
+ "tokens/train_per_sec_per_gpu": 15.47,
1726
+ "tokens/trainable": 6230984
1727
+ },
1728
+ {
1729
+ "epoch": 1.1745454545454546,
1730
+ "grad_norm": 0.0025893959682434797,
1731
+ "learning_rate": 0.00018916204714082034,
1732
+ "loss": 0.0005178887862712145,
1733
+ "memory/device_reserved (GiB)": 20.01,
1734
+ "memory/max_active (GiB)": 16.23,
1735
+ "memory/max_allocated (GiB)": 16.23,
1736
+ "ppl": 1.00052,
1737
+ "step": 1210,
1738
+ "tokens/total": 19836928,
1739
+ "tokens/train_per_sec_per_gpu": 14.13,
1740
+ "tokens/trainable": 6282713
1741
+ },
1742
+ {
1743
+ "epoch": 1.1842424242424243,
1744
+ "grad_norm": 0.017288153991103172,
1745
+ "learning_rate": 0.00018885344083972914,
1746
+ "loss": 0.0005050559528172016,
1747
+ "memory/device_reserved (GiB)": 20.01,
1748
+ "memory/max_active (GiB)": 16.23,
1749
+ "memory/max_allocated (GiB)": 16.23,
1750
+ "ppl": 1.00051,
1751
+ "step": 1220,
1752
+ "tokens/total": 20000768,
1753
+ "tokens/train_per_sec_per_gpu": 14.31,
1754
+ "tokens/trainable": 6334555
1755
+ },
1756
+ {
1757
+ "epoch": 1.1939393939393939,
1758
+ "grad_norm": 0.00206086877733469,
1759
+ "learning_rate": 0.00018854076133263003,
1760
+ "loss": 0.00020185327157378196,
1761
+ "memory/device_reserved (GiB)": 20.01,
1762
+ "memory/max_active (GiB)": 16.23,
1763
+ "memory/max_allocated (GiB)": 16.23,
1764
+ "ppl": 1.0002,
1765
+ "step": 1230,
1766
+ "tokens/total": 20164608,
1767
+ "tokens/train_per_sec_per_gpu": 14.72,
1768
+ "tokens/trainable": 6386137
1769
+ },
1770
+ {
1771
+ "epoch": 1.2036363636363636,
1772
+ "grad_norm": 0.02184407040476799,
1773
+ "learning_rate": 0.0001882240229533297,
1774
+ "loss": 0.00048260441981256007,
1775
+ "memory/device_reserved (GiB)": 20.01,
1776
+ "memory/max_active (GiB)": 16.23,
1777
+ "memory/max_allocated (GiB)": 16.23,
1778
+ "ppl": 1.00048,
1779
+ "step": 1240,
1780
+ "tokens/total": 20328448,
1781
+ "tokens/train_per_sec_per_gpu": 14.35,
1782
+ "tokens/trainable": 6437493
1783
+ },
1784
+ {
1785
+ "epoch": 1.2133333333333334,
1786
+ "grad_norm": 0.04215926304459572,
1787
+ "learning_rate": 0.00018790324022170118,
1788
+ "loss": 0.0003190681803971529,
1789
+ "memory/device_reserved (GiB)": 20.01,
1790
+ "memory/max_active (GiB)": 16.23,
1791
+ "memory/max_allocated (GiB)": 16.23,
1792
+ "ppl": 1.00032,
1793
+ "step": 1250,
1794
+ "tokens/total": 20492288,
1795
+ "tokens/train_per_sec_per_gpu": 14.51,
1796
+ "tokens/trainable": 6488834
1797
+ },
1798
+ {
1799
+ "epoch": 1.2230303030303031,
1800
+ "grad_norm": 0.006890668533742428,
1801
+ "learning_rate": 0.00018757842784301784,
1802
+ "loss": 0.0005027144681662322,
1803
+ "memory/device_reserved (GiB)": 20.01,
1804
+ "memory/max_active (GiB)": 16.23,
1805
+ "memory/max_allocated (GiB)": 16.23,
1806
+ "ppl": 1.0005,
1807
+ "step": 1260,
1808
+ "tokens/total": 20656128,
1809
+ "tokens/train_per_sec_per_gpu": 14.26,
1810
+ "tokens/trainable": 6540606
1811
+ },
1812
+ {
1813
+ "epoch": 1.2327272727272727,
1814
+ "grad_norm": 0.005489532835781574,
1815
+ "learning_rate": 0.00018724960070727972,
1816
+ "loss": 0.0006080259568989277,
1817
+ "memory/device_reserved (GiB)": 20.01,
1818
+ "memory/max_active (GiB)": 16.23,
1819
+ "memory/max_allocated (GiB)": 16.23,
1820
+ "ppl": 1.00061,
1821
+ "step": 1270,
1822
+ "tokens/total": 20819968,
1823
+ "tokens/train_per_sec_per_gpu": 13.92,
1824
+ "tokens/trainable": 6592727
1825
+ },
1826
+ {
1827
+ "epoch": 1.2424242424242424,
1828
+ "grad_norm": 0.005877023097127676,
1829
+ "learning_rate": 0.00018691677388853068,
1830
+ "loss": 0.0006749071180820465,
1831
+ "memory/device_reserved (GiB)": 20.01,
1832
+ "memory/max_active (GiB)": 16.23,
1833
+ "memory/max_allocated (GiB)": 16.23,
1834
+ "ppl": 1.00068,
1835
+ "step": 1280,
1836
+ "tokens/total": 20983808,
1837
+ "tokens/train_per_sec_per_gpu": 14.93,
1838
+ "tokens/trainable": 6645179
1839
+ },
1840
+ {
1841
+ "epoch": 1.2521212121212122,
1842
+ "grad_norm": 0.0061390516348183155,
1843
+ "learning_rate": 0.00018657996264416745,
1844
+ "loss": 0.0002642946550622582,
1845
+ "memory/device_reserved (GiB)": 20.01,
1846
+ "memory/max_active (GiB)": 16.23,
1847
+ "memory/max_allocated (GiB)": 16.23,
1848
+ "ppl": 1.00026,
1849
+ "step": 1290,
1850
+ "tokens/total": 21147648,
1851
+ "tokens/train_per_sec_per_gpu": 14.92,
1852
+ "tokens/trainable": 6697406
1853
+ },
1854
+ {
1855
+ "epoch": 1.2618181818181817,
1856
+ "grad_norm": 0.03444842994213104,
1857
+ "learning_rate": 0.0001862391824142402,
1858
+ "loss": 0.0004464905709028244,
1859
+ "memory/device_reserved (GiB)": 20.01,
1860
+ "memory/max_active (GiB)": 16.23,
1861
+ "memory/max_allocated (GiB)": 16.23,
1862
+ "ppl": 1.00045,
1863
+ "step": 1300,
1864
+ "tokens/total": 21311488,
1865
+ "tokens/train_per_sec_per_gpu": 15.07,
1866
+ "tokens/trainable": 6749589
1867
+ },
1868
+ {
1869
+ "epoch": 1.2715151515151515,
1870
+ "grad_norm": 0.0036635284777730703,
1871
+ "learning_rate": 0.00018589444882074474,
1872
+ "loss": 0.0002096141455695033,
1873
+ "memory/device_reserved (GiB)": 20.01,
1874
+ "memory/max_active (GiB)": 16.23,
1875
+ "memory/max_allocated (GiB)": 16.23,
1876
+ "ppl": 1.00021,
1877
+ "step": 1310,
1878
+ "tokens/total": 21475328,
1879
+ "tokens/train_per_sec_per_gpu": 13.69,
1880
+ "tokens/trainable": 6801799
1881
+ },
1882
+ {
1883
+ "epoch": 1.2812121212121212,
1884
+ "grad_norm": 0.003200239036232233,
1885
+ "learning_rate": 0.00018554577766690636,
1886
+ "loss": 0.00026335257571190595,
1887
+ "memory/device_reserved (GiB)": 20.01,
1888
+ "memory/max_active (GiB)": 16.23,
1889
+ "memory/max_allocated (GiB)": 16.23,
1890
+ "ppl": 1.00026,
1891
+ "step": 1320,
1892
+ "tokens/total": 21639168,
1893
+ "tokens/train_per_sec_per_gpu": 14.58,
1894
+ "tokens/trainable": 6854205
1895
+ },
1896
+ {
1897
+ "epoch": 1.290909090909091,
1898
+ "grad_norm": 0.00109296350274235,
1899
+ "learning_rate": 0.0001851931849364554,
1900
+ "loss": 0.0003910743165761232,
1901
+ "memory/device_reserved (GiB)": 20.01,
1902
+ "memory/max_active (GiB)": 16.23,
1903
+ "memory/max_allocated (GiB)": 16.23,
1904
+ "ppl": 1.00039,
1905
+ "step": 1330,
1906
+ "tokens/total": 21803008,
1907
+ "tokens/train_per_sec_per_gpu": 14.96,
1908
+ "tokens/trainable": 6906145
1909
+ },
1910
+ {
1911
+ "epoch": 1.3006060606060605,
1912
+ "grad_norm": 0.0006913666147738695,
1913
+ "learning_rate": 0.00018483668679289452,
1914
+ "loss": 0.0003079640679061413,
1915
+ "memory/device_reserved (GiB)": 20.01,
1916
+ "memory/max_active (GiB)": 16.23,
1917
+ "memory/max_allocated (GiB)": 16.23,
1918
+ "ppl": 1.00031,
1919
+ "step": 1340,
1920
+ "tokens/total": 21966848,
1921
+ "tokens/train_per_sec_per_gpu": 15.13,
1922
+ "tokens/trainable": 6957405
1923
+ },
1924
+ {
1925
+ "epoch": 1.3103030303030303,
1926
+ "grad_norm": 0.03036116063594818,
1927
+ "learning_rate": 0.00018447629957875776,
1928
+ "loss": 0.0003281526267528534,
1929
+ "memory/device_reserved (GiB)": 20.01,
1930
+ "memory/max_active (GiB)": 16.23,
1931
+ "memory/max_allocated (GiB)": 16.23,
1932
+ "ppl": 1.00033,
1933
+ "step": 1350,
1934
+ "tokens/total": 22130688,
1935
+ "tokens/train_per_sec_per_gpu": 15.08,
1936
+ "tokens/trainable": 7009256
1937
+ },
1938
+ {
1939
+ "epoch": 1.32,
1940
+ "grad_norm": 0.012580045498907566,
1941
+ "learning_rate": 0.00018411203981486134,
1942
+ "loss": 0.0006514057982712984,
1943
+ "memory/device_reserved (GiB)": 20.01,
1944
+ "memory/max_active (GiB)": 16.23,
1945
+ "memory/max_allocated (GiB)": 16.23,
1946
+ "ppl": 1.00065,
1947
+ "step": 1360,
1948
+ "tokens/total": 22294528,
1949
+ "tokens/train_per_sec_per_gpu": 14.66,
1950
+ "tokens/trainable": 7060734
1951
+ },
1952
+ {
1953
+ "epoch": 1.3296969696969696,
1954
+ "grad_norm": 0.00828342791646719,
1955
+ "learning_rate": 0.00018374392419954628,
1956
+ "loss": 0.0003020781092345715,
1957
+ "memory/device_reserved (GiB)": 20.01,
1958
+ "memory/max_active (GiB)": 16.23,
1959
+ "memory/max_allocated (GiB)": 16.23,
1960
+ "ppl": 1.0003,
1961
+ "step": 1370,
1962
+ "tokens/total": 22458368,
1963
+ "tokens/train_per_sec_per_gpu": 15.09,
1964
+ "tokens/trainable": 7112415
1965
+ },
1966
+ {
1967
+ "epoch": 1.3393939393939394,
1968
+ "grad_norm": 0.09482505917549133,
1969
+ "learning_rate": 0.00018337196960791302,
1970
+ "loss": 0.0006797847803682089,
1971
+ "memory/device_reserved (GiB)": 20.01,
1972
+ "memory/max_active (GiB)": 16.23,
1973
+ "memory/max_allocated (GiB)": 16.23,
1974
+ "ppl": 1.00068,
1975
+ "step": 1380,
1976
+ "tokens/total": 22622208,
1977
+ "tokens/train_per_sec_per_gpu": 15.03,
1978
+ "tokens/trainable": 7164110
1979
+ },
1980
+ {
1981
+ "epoch": 1.3490909090909091,
1982
+ "grad_norm": 0.04534842446446419,
1983
+ "learning_rate": 0.00018299619309104773,
1984
+ "loss": 0.000729580270126462,
1985
+ "memory/device_reserved (GiB)": 20.01,
1986
+ "memory/max_active (GiB)": 16.23,
1987
+ "memory/max_allocated (GiB)": 16.23,
1988
+ "ppl": 1.00073,
1989
+ "step": 1390,
1990
+ "tokens/total": 22786048,
1991
+ "tokens/train_per_sec_per_gpu": 15.49,
1992
+ "tokens/trainable": 7215797
1993
+ },
1994
+ {
1995
+ "epoch": 1.3587878787878789,
1996
+ "grad_norm": 0.010737202130258083,
1997
+ "learning_rate": 0.00018261661187524072,
1998
+ "loss": 0.0007514740340411663,
1999
+ "memory/device_reserved (GiB)": 20.01,
2000
+ "memory/max_active (GiB)": 16.23,
2001
+ "memory/max_allocated (GiB)": 16.23,
2002
+ "ppl": 1.00075,
2003
+ "step": 1400,
2004
+ "tokens/total": 22949888,
2005
+ "tokens/train_per_sec_per_gpu": 14.14,
2006
+ "tokens/trainable": 7267691
2007
+ },
2008
+ {
2009
+ "epoch": 1.3684848484848484,
2010
+ "grad_norm": 0.05600081756711006,
2011
+ "learning_rate": 0.00018223324336119672,
2012
+ "loss": 0.001420076284557581,
2013
+ "memory/device_reserved (GiB)": 20.01,
2014
+ "memory/max_active (GiB)": 16.23,
2015
+ "memory/max_allocated (GiB)": 16.23,
2016
+ "ppl": 1.00142,
2017
+ "step": 1410,
2018
+ "tokens/total": 23113728,
2019
+ "tokens/train_per_sec_per_gpu": 15.3,
2020
+ "tokens/trainable": 7319876
2021
+ },
2022
+ {
2023
+ "epoch": 1.3781818181818182,
2024
+ "grad_norm": 0.019460471346974373,
2025
+ "learning_rate": 0.00018184610512323718,
2026
+ "loss": 0.0022406818345189093,
2027
+ "memory/device_reserved (GiB)": 20.01,
2028
+ "memory/max_active (GiB)": 16.23,
2029
+ "memory/max_allocated (GiB)": 16.23,
2030
+ "ppl": 1.00224,
2031
+ "step": 1420,
2032
+ "tokens/total": 23277568,
2033
+ "tokens/train_per_sec_per_gpu": 14.38,
2034
+ "tokens/trainable": 7371762
2035
+ },
2036
+ {
2037
+ "epoch": 1.387878787878788,
2038
+ "grad_norm": 0.03277068957686424,
2039
+ "learning_rate": 0.00018145521490849477,
2040
+ "loss": 0.000915923435240984,
2041
+ "memory/device_reserved (GiB)": 20.01,
2042
+ "memory/max_active (GiB)": 16.23,
2043
+ "memory/max_allocated (GiB)": 16.23,
2044
+ "ppl": 1.00092,
2045
+ "step": 1430,
2046
+ "tokens/total": 23441408,
2047
+ "tokens/train_per_sec_per_gpu": 14.66,
2048
+ "tokens/trainable": 7423685
2049
+ },
2050
+ {
2051
+ "epoch": 1.3975757575757575,
2052
+ "grad_norm": 0.0156385600566864,
2053
+ "learning_rate": 0.0001810605906360996,
2054
+ "loss": 0.000897888746112585,
2055
+ "memory/device_reserved (GiB)": 20.01,
2056
+ "memory/max_active (GiB)": 16.23,
2057
+ "memory/max_allocated (GiB)": 16.23,
2058
+ "ppl": 1.0009,
2059
+ "step": 1440,
2060
+ "tokens/total": 23605248,
2061
+ "tokens/train_per_sec_per_gpu": 13.99,
2062
+ "tokens/trainable": 7476266
2063
+ },
2064
+ {
2065
+ "epoch": 1.4072727272727272,
2066
+ "grad_norm": 0.01643913984298706,
2067
+ "learning_rate": 0.00018066225039635794,
2068
+ "loss": 0.000922933965921402,
2069
+ "memory/device_reserved (GiB)": 20.01,
2070
+ "memory/max_active (GiB)": 16.23,
2071
+ "memory/max_allocated (GiB)": 16.23,
2072
+ "ppl": 1.00092,
2073
+ "step": 1450,
2074
+ "tokens/total": 23769088,
2075
+ "tokens/train_per_sec_per_gpu": 14.57,
2076
+ "tokens/trainable": 7528208
2077
+ },
2078
+ {
2079
+ "epoch": 1.416969696969697,
2080
+ "grad_norm": 0.024322666227817535,
2081
+ "learning_rate": 0.00018026021244992287,
2082
+ "loss": 0.0011652217246592045,
2083
+ "memory/device_reserved (GiB)": 20.01,
2084
+ "memory/max_active (GiB)": 16.23,
2085
+ "memory/max_allocated (GiB)": 16.23,
2086
+ "ppl": 1.00117,
2087
+ "step": 1460,
2088
+ "tokens/total": 23932928,
2089
+ "tokens/train_per_sec_per_gpu": 13.91,
2090
+ "tokens/trainable": 7580038
2091
+ },
2092
+ {
2093
+ "epoch": 1.4266666666666667,
2094
+ "grad_norm": 0.05165834724903107,
2095
+ "learning_rate": 0.0001798544952269572,
2096
+ "loss": 0.0009731135331094265,
2097
+ "memory/device_reserved (GiB)": 20.01,
2098
+ "memory/max_active (GiB)": 16.23,
2099
+ "memory/max_allocated (GiB)": 16.23,
2100
+ "ppl": 1.00097,
2101
+ "step": 1470,
2102
+ "tokens/total": 24096768,
2103
+ "tokens/train_per_sec_per_gpu": 14.56,
2104
+ "tokens/trainable": 7631772
2105
+ },
2106
+ {
2107
+ "epoch": 1.4363636363636363,
2108
+ "grad_norm": 0.02529827691614628,
2109
+ "learning_rate": 0.0001794451173262885,
2110
+ "loss": 0.0005802253726869822,
2111
+ "memory/device_reserved (GiB)": 20.01,
2112
+ "memory/max_active (GiB)": 16.23,
2113
+ "memory/max_allocated (GiB)": 16.23,
2114
+ "ppl": 1.00058,
2115
+ "step": 1480,
2116
+ "tokens/total": 24260608,
2117
+ "tokens/train_per_sec_per_gpu": 13.72,
2118
+ "tokens/trainable": 7683048
2119
+ },
2120
+ {
2121
+ "epoch": 1.446060606060606,
2122
+ "grad_norm": 0.0670745000243187,
2123
+ "learning_rate": 0.00017903209751455665,
2124
+ "loss": 0.000642474414780736,
2125
+ "memory/device_reserved (GiB)": 20.01,
2126
+ "memory/max_active (GiB)": 16.23,
2127
+ "memory/max_allocated (GiB)": 16.23,
2128
+ "ppl": 1.00064,
2129
+ "step": 1490,
2130
+ "tokens/total": 24424448,
2131
+ "tokens/train_per_sec_per_gpu": 14.33,
2132
+ "tokens/trainable": 7735332
2133
+ },
2134
+ {
2135
+ "epoch": 1.4557575757575758,
2136
+ "grad_norm": 0.02367187850177288,
2137
+ "learning_rate": 0.00017861545472535348,
2138
+ "loss": 0.00032834114972501993,
2139
+ "memory/device_reserved (GiB)": 20.01,
2140
+ "memory/max_active (GiB)": 16.23,
2141
+ "memory/max_allocated (GiB)": 16.23,
2142
+ "ppl": 1.00033,
2143
+ "step": 1500,
2144
+ "tokens/total": 24588288,
2145
+ "tokens/train_per_sec_per_gpu": 16.37,
2146
+ "tokens/trainable": 7787186
2147
+ },
2148
+ {
2149
+ "epoch": 1.4654545454545453,
2150
+ "grad_norm": 0.011678172275424004,
2151
+ "learning_rate": 0.00017819520805835475,
2152
+ "loss": 0.0009690596722066403,
2153
+ "memory/device_reserved (GiB)": 20.01,
2154
+ "memory/max_active (GiB)": 16.23,
2155
+ "memory/max_allocated (GiB)": 16.23,
2156
+ "ppl": 1.00097,
2157
+ "step": 1510,
2158
+ "tokens/total": 24752128,
2159
+ "tokens/train_per_sec_per_gpu": 13.55,
2160
+ "tokens/trainable": 7838878
2161
+ },
2162
+ {
2163
+ "epoch": 1.475151515151515,
2164
+ "grad_norm": 0.05298800393939018,
2165
+ "learning_rate": 0.00017777137677844461,
2166
+ "loss": 0.0009098535403609276,
2167
+ "memory/device_reserved (GiB)": 20.01,
2168
+ "memory/max_active (GiB)": 16.23,
2169
+ "memory/max_allocated (GiB)": 16.23,
2170
+ "ppl": 1.00091,
2171
+ "step": 1520,
2172
+ "tokens/total": 24915968,
2173
+ "tokens/train_per_sec_per_gpu": 14.33,
2174
+ "tokens/trainable": 7890631
2175
+ },
2176
+ {
2177
+ "epoch": 1.4848484848484849,
2178
+ "grad_norm": 0.037918779999017715,
2179
+ "learning_rate": 0.00017734398031483265,
2180
+ "loss": 0.0006457697600126266,
2181
+ "memory/device_reserved (GiB)": 20.01,
2182
+ "memory/max_active (GiB)": 16.23,
2183
+ "memory/max_allocated (GiB)": 16.23,
2184
+ "ppl": 1.00065,
2185
+ "step": 1530,
2186
+ "tokens/total": 25079808,
2187
+ "tokens/train_per_sec_per_gpu": 13.25,
2188
+ "tokens/trainable": 7942366
2189
+ },
2190
+ {
2191
+ "epoch": 1.4945454545454546,
2192
+ "grad_norm": 0.02729674056172371,
2193
+ "learning_rate": 0.0001769130382601629,
2194
+ "loss": 0.0009943137876689434,
2195
+ "memory/device_reserved (GiB)": 20.01,
2196
+ "memory/max_active (GiB)": 16.23,
2197
+ "memory/max_allocated (GiB)": 16.23,
2198
+ "ppl": 1.00099,
2199
+ "step": 1540,
2200
+ "tokens/total": 25243648,
2201
+ "tokens/train_per_sec_per_gpu": 14.37,
2202
+ "tokens/trainable": 7994307
2203
+ },
2204
+ {
2205
+ "epoch": 1.5023030303030303,
2206
+ "eval_loss": 0.0006865999894216657,
2207
+ "eval_ppl": 1.00069,
2208
+ "eval_runtime": 12.127,
2209
+ "eval_samples_per_second": 16.492,
2210
+ "eval_steps_per_second": 8.246,
2211
+ "memory/device_reserved (GiB)": 20.01,
2212
+ "memory/max_active (GiB)": 16.23,
2213
+ "memory/max_allocated (GiB)": 16.23,
2214
+ "step": 1548
2215
+ },
2216
+ {
2217
+ "epoch": 1.5042424242424244,
2218
+ "grad_norm": 0.053267233073711395,
2219
+ "learning_rate": 0.00017647857036961592,
2220
+ "loss": 0.0006284893956035375,
2221
+ "memory/device_reserved (GiB)": 20.01,
2222
+ "memory/max_active (GiB)": 16.23,
2223
+ "memory/max_allocated (GiB)": 16.23,
2224
+ "ppl": 1.00063,
2225
+ "step": 1550,
2226
+ "tokens/total": 25407488,
2227
+ "tokens/train_per_sec_per_gpu": 14.87,
2228
+ "tokens/trainable": 8046124
2229
+ },
2230
+ {
2231
+ "epoch": 1.513939393939394,
2232
+ "grad_norm": 0.05232734978199005,
2233
+ "learning_rate": 0.0001760405965600031,
2234
+ "loss": 0.0005064161494374275,
2235
+ "memory/device_reserved (GiB)": 20.01,
2236
+ "memory/max_active (GiB)": 16.23,
2237
+ "memory/max_allocated (GiB)": 16.23,
2238
+ "ppl": 1.00051,
2239
+ "step": 1560,
2240
+ "tokens/total": 25571328,
2241
+ "tokens/train_per_sec_per_gpu": 14.39,
2242
+ "tokens/trainable": 8098367
2243
+ },
2244
+ {
2245
+ "epoch": 1.5236363636363637,
2246
+ "grad_norm": 0.015440079383552074,
2247
+ "learning_rate": 0.00017559913690885364,
2248
+ "loss": 0.0004742793273180723,
2249
+ "memory/device_reserved (GiB)": 20.01,
2250
+ "memory/max_active (GiB)": 16.23,
2251
+ "memory/max_allocated (GiB)": 16.23,
2252
+ "ppl": 1.00047,
2253
+ "step": 1570,
2254
+ "tokens/total": 25735168,
2255
+ "tokens/train_per_sec_per_gpu": 14.19,
2256
+ "tokens/trainable": 8150005
2257
+ },
2258
+ {
2259
+ "epoch": 1.5333333333333332,
2260
+ "grad_norm": 0.005799058359116316,
2261
+ "learning_rate": 0.00017515421165349414,
2262
+ "loss": 0.0005522690713405609,
2263
+ "memory/device_reserved (GiB)": 20.01,
2264
+ "memory/max_active (GiB)": 16.23,
2265
+ "memory/max_allocated (GiB)": 16.23,
2266
+ "ppl": 1.00055,
2267
+ "step": 1580,
2268
+ "tokens/total": 25899008,
2269
+ "tokens/train_per_sec_per_gpu": 14.94,
2270
+ "tokens/trainable": 8201985
2271
+ },
2272
+ {
2273
+ "epoch": 1.543030303030303,
2274
+ "grad_norm": 0.025745827704668045,
2275
+ "learning_rate": 0.00017470584119012094,
2276
+ "loss": 0.0004415466450154781,
2277
+ "memory/device_reserved (GiB)": 20.01,
2278
+ "memory/max_active (GiB)": 16.23,
2279
+ "memory/max_allocated (GiB)": 16.23,
2280
+ "ppl": 1.00044,
2281
+ "step": 1590,
2282
+ "tokens/total": 26062848,
2283
+ "tokens/train_per_sec_per_gpu": 14.76,
2284
+ "tokens/trainable": 8253407
2285
+ },
2286
+ {
2287
+ "epoch": 1.5527272727272727,
2288
+ "grad_norm": 0.006111942231655121,
2289
+ "learning_rate": 0.00017425404607286508,
2290
+ "loss": 0.0004033858887851238,
2291
+ "memory/device_reserved (GiB)": 20.01,
2292
+ "memory/max_active (GiB)": 16.23,
2293
+ "memory/max_allocated (GiB)": 16.23,
2294
+ "ppl": 1.0004,
2295
+ "step": 1600,
2296
+ "tokens/total": 26226688,
2297
+ "tokens/train_per_sec_per_gpu": 13.45,
2298
+ "tokens/trainable": 8305596
2299
+ },
2300
+ {
2301
+ "epoch": 1.5624242424242425,
2302
+ "grad_norm": 0.01315031573176384,
2303
+ "learning_rate": 0.00017379884701285,
2304
+ "loss": 0.0006456051021814346,
2305
+ "memory/device_reserved (GiB)": 20.01,
2306
+ "memory/max_active (GiB)": 16.23,
2307
+ "memory/max_allocated (GiB)": 16.23,
2308
+ "ppl": 1.00065,
2309
+ "step": 1610,
2310
+ "tokens/total": 26390528,
2311
+ "tokens/train_per_sec_per_gpu": 15.34,
2312
+ "tokens/trainable": 8357648
2313
+ },
2314
+ {
2315
+ "epoch": 1.5721212121212123,
2316
+ "grad_norm": 0.002383842132985592,
2317
+ "learning_rate": 0.00017334026487724225,
2318
+ "loss": 0.00028960562776774167,
2319
+ "memory/device_reserved (GiB)": 20.01,
2320
+ "memory/max_active (GiB)": 16.23,
2321
+ "memory/max_allocated (GiB)": 16.23,
2322
+ "ppl": 1.00029,
2323
+ "step": 1620,
2324
+ "tokens/total": 26554368,
2325
+ "tokens/train_per_sec_per_gpu": 14.29,
2326
+ "tokens/trainable": 8410056
2327
+ },
2328
+ {
2329
+ "epoch": 1.5818181818181818,
2330
+ "grad_norm": 0.006294222082942724,
2331
+ "learning_rate": 0.0001728783206882948,
2332
+ "loss": 0.00025043871719390156,
2333
+ "memory/device_reserved (GiB)": 20.01,
2334
+ "memory/max_active (GiB)": 16.23,
2335
+ "memory/max_allocated (GiB)": 16.23,
2336
+ "ppl": 1.00025,
2337
+ "step": 1630,
2338
+ "tokens/total": 26718208,
2339
+ "tokens/train_per_sec_per_gpu": 15.1,
2340
+ "tokens/trainable": 8461798
2341
+ },
2342
+ {
2343
+ "epoch": 1.5915151515151515,
2344
+ "grad_norm": 8.702854393050075e-05,
2345
+ "learning_rate": 0.00017241303562238336,
2346
+ "loss": 0.00012461008736863732,
2347
+ "memory/device_reserved (GiB)": 20.01,
2348
+ "memory/max_active (GiB)": 16.23,
2349
+ "memory/max_allocated (GiB)": 16.23,
2350
+ "ppl": 1.00012,
2351
+ "step": 1640,
2352
+ "tokens/total": 26882048,
2353
+ "tokens/train_per_sec_per_gpu": 15.61,
2354
+ "tokens/trainable": 8514035
2355
+ },
2356
+ {
2357
+ "epoch": 1.601212121212121,
2358
+ "grad_norm": 0.07624056935310364,
2359
+ "learning_rate": 0.00017194443100903558,
2360
+ "loss": 0.00024855402298271654,
2361
+ "memory/device_reserved (GiB)": 20.01,
2362
+ "memory/max_active (GiB)": 16.23,
2363
+ "memory/max_allocated (GiB)": 16.23,
2364
+ "ppl": 1.00025,
2365
+ "step": 1650,
2366
+ "tokens/total": 27045888,
2367
+ "tokens/train_per_sec_per_gpu": 14.48,
2368
+ "tokens/trainable": 8565875
2369
+ },
2370
+ {
2371
+ "epoch": 1.6109090909090908,
2372
+ "grad_norm": 0.02497026138007641,
2373
+ "learning_rate": 0.00017147252832995337,
2374
+ "loss": 0.00044286823831498625,
2375
+ "memory/device_reserved (GiB)": 20.01,
2376
+ "memory/max_active (GiB)": 16.23,
2377
+ "memory/max_allocated (GiB)": 16.23,
2378
+ "ppl": 1.00044,
2379
+ "step": 1660,
2380
+ "tokens/total": 27209728,
2381
+ "tokens/train_per_sec_per_gpu": 14.47,
2382
+ "tokens/trainable": 8617912
2383
+ },
2384
+ {
2385
+ "epoch": 1.6206060606060606,
2386
+ "grad_norm": 0.0016530955908820033,
2387
+ "learning_rate": 0.00017099734921802802,
2388
+ "loss": 0.0003104714211076498,
2389
+ "memory/device_reserved (GiB)": 20.01,
2390
+ "memory/max_active (GiB)": 16.23,
2391
+ "memory/max_allocated (GiB)": 16.23,
2392
+ "ppl": 1.00031,
2393
+ "step": 1670,
2394
+ "tokens/total": 27373568,
2395
+ "tokens/train_per_sec_per_gpu": 13.53,
2396
+ "tokens/trainable": 8669875
2397
+ },
2398
+ {
2399
+ "epoch": 1.6303030303030304,
2400
+ "grad_norm": 0.02621961385011673,
2401
+ "learning_rate": 0.00017051891545634854,
2402
+ "loss": 0.0004010321106761694,
2403
+ "memory/device_reserved (GiB)": 20.01,
2404
+ "memory/max_active (GiB)": 16.23,
2405
+ "memory/max_allocated (GiB)": 16.23,
2406
+ "ppl": 1.0004,
2407
+ "step": 1680,
2408
+ "tokens/total": 27537408,
2409
+ "tokens/train_per_sec_per_gpu": 16.09,
2410
+ "tokens/trainable": 8721709
2411
+ },
2412
+ {
2413
+ "epoch": 1.6400000000000001,
2414
+ "grad_norm": 0.043721288442611694,
2415
+ "learning_rate": 0.00017003724897720316,
2416
+ "loss": 0.00042473864741623404,
2417
+ "memory/device_reserved (GiB)": 20.01,
2418
+ "memory/max_active (GiB)": 16.23,
2419
+ "memory/max_allocated (GiB)": 16.23,
2420
+ "ppl": 1.00042,
2421
+ "step": 1690,
2422
+ "tokens/total": 27701248,
2423
+ "tokens/train_per_sec_per_gpu": 14.84,
2424
+ "tokens/trainable": 8773762
2425
+ },
2426
+ {
2427
+ "epoch": 1.6496969696969697,
2428
+ "grad_norm": 0.01791808009147644,
2429
+ "learning_rate": 0.00016955237186107387,
2430
+ "loss": 0.0003858121577650309,
2431
+ "memory/device_reserved (GiB)": 20.01,
2432
+ "memory/max_active (GiB)": 16.23,
2433
+ "memory/max_allocated (GiB)": 16.23,
2434
+ "ppl": 1.00039,
2435
+ "step": 1700,
2436
+ "tokens/total": 27865088,
2437
+ "tokens/train_per_sec_per_gpu": 14.87,
2438
+ "tokens/trainable": 8825435
2439
+ },
2440
+ {
2441
+ "epoch": 1.6593939393939394,
2442
+ "grad_norm": 0.017175329849123955,
2443
+ "learning_rate": 0.0001690643063356241,
2444
+ "loss": 0.0003785108681768179,
2445
+ "memory/device_reserved (GiB)": 20.01,
2446
+ "memory/max_active (GiB)": 16.23,
2447
+ "memory/max_allocated (GiB)": 16.23,
2448
+ "ppl": 1.00038,
2449
+ "step": 1710,
2450
+ "tokens/total": 28028928,
2451
+ "tokens/train_per_sec_per_gpu": 13.63,
2452
+ "tokens/trainable": 8877227
2453
+ },
2454
+ {
2455
+ "epoch": 1.669090909090909,
2456
+ "grad_norm": 0.03429865464568138,
2457
+ "learning_rate": 0.0001685730747746799,
2458
+ "loss": 0.0003128159558400512,
2459
+ "memory/device_reserved (GiB)": 20.01,
2460
+ "memory/max_active (GiB)": 16.23,
2461
+ "memory/max_allocated (GiB)": 16.23,
2462
+ "ppl": 1.00031,
2463
+ "step": 1720,
2464
+ "tokens/total": 28192768,
2465
+ "tokens/train_per_sec_per_gpu": 13.42,
2466
+ "tokens/trainable": 8928835
2467
+ },
2468
+ {
2469
+ "epoch": 1.6787878787878787,
2470
+ "grad_norm": 0.008623798377811909,
2471
+ "learning_rate": 0.0001680786996972043,
2472
+ "loss": 0.0008884714916348457,
2473
+ "memory/device_reserved (GiB)": 20.01,
2474
+ "memory/max_active (GiB)": 16.23,
2475
+ "memory/max_allocated (GiB)": 16.23,
2476
+ "ppl": 1.00089,
2477
+ "step": 1730,
2478
+ "tokens/total": 28356608,
2479
+ "tokens/train_per_sec_per_gpu": 14.8,
2480
+ "tokens/trainable": 8979863
2481
+ },
2482
+ {
2483
+ "epoch": 1.6884848484848485,
2484
+ "grad_norm": 0.007137796841561794,
2485
+ "learning_rate": 0.00016758120376626488,
2486
+ "loss": 0.000342932902276516,
2487
+ "memory/device_reserved (GiB)": 20.01,
2488
+ "memory/max_active (GiB)": 16.23,
2489
+ "memory/max_allocated (GiB)": 16.23,
2490
+ "ppl": 1.00034,
2491
+ "step": 1740,
2492
+ "tokens/total": 28520448,
2493
+ "tokens/train_per_sec_per_gpu": 13.64,
2494
+ "tokens/trainable": 9031317
2495
+ },
2496
+ {
2497
+ "epoch": 1.6981818181818182,
2498
+ "grad_norm": 0.006754934322088957,
2499
+ "learning_rate": 0.00016708060978799493,
2500
+ "loss": 0.00031610706355422735,
2501
+ "memory/device_reserved (GiB)": 20.01,
2502
+ "memory/max_active (GiB)": 16.23,
2503
+ "memory/max_allocated (GiB)": 16.23,
2504
+ "ppl": 1.00032,
2505
+ "step": 1750,
2506
+ "tokens/total": 28684288,
2507
+ "tokens/train_per_sec_per_gpu": 16.63,
2508
+ "tokens/trainable": 9082925
2509
+ },
2510
+ {
2511
+ "epoch": 1.707878787878788,
2512
+ "grad_norm": 0.012158721685409546,
2513
+ "learning_rate": 0.00016657694071054794,
2514
+ "loss": 0.00039324900135397913,
2515
+ "memory/device_reserved (GiB)": 20.01,
2516
+ "memory/max_active (GiB)": 16.23,
2517
+ "memory/max_allocated (GiB)": 16.23,
2518
+ "ppl": 1.00039,
2519
+ "step": 1760,
2520
+ "tokens/total": 28848128,
2521
+ "tokens/train_per_sec_per_gpu": 14.31,
2522
+ "tokens/trainable": 9134535
2523
+ },
2524
+ {
2525
+ "epoch": 1.7175757575757575,
2526
+ "grad_norm": 0.04653792828321457,
2527
+ "learning_rate": 0.00016607021962304565,
2528
+ "loss": 0.0003617320442572236,
2529
+ "memory/device_reserved (GiB)": 20.01,
2530
+ "memory/max_active (GiB)": 16.23,
2531
+ "memory/max_allocated (GiB)": 16.23,
2532
+ "ppl": 1.00036,
2533
+ "step": 1770,
2534
+ "tokens/total": 29011968,
2535
+ "tokens/train_per_sec_per_gpu": 14.01,
2536
+ "tokens/trainable": 9186666
2537
+ },
2538
+ {
2539
+ "epoch": 1.7272727272727273,
2540
+ "grad_norm": 0.009638557210564613,
2541
+ "learning_rate": 0.00016556046975451963,
2542
+ "loss": 0.00031410730443894865,
2543
+ "memory/device_reserved (GiB)": 20.01,
2544
+ "memory/max_active (GiB)": 16.23,
2545
+ "memory/max_allocated (GiB)": 16.23,
2546
+ "ppl": 1.00031,
2547
+ "step": 1780,
2548
+ "tokens/total": 29175808,
2549
+ "tokens/train_per_sec_per_gpu": 14.23,
2550
+ "tokens/trainable": 9238529
2551
+ },
2552
+ {
2553
+ "epoch": 1.7369696969696968,
2554
+ "grad_norm": 0.017064686864614487,
2555
+ "learning_rate": 0.0001650477144728462,
2556
+ "loss": 0.00043909624218940735,
2557
+ "memory/device_reserved (GiB)": 20.01,
2558
+ "memory/max_active (GiB)": 16.23,
2559
+ "memory/max_allocated (GiB)": 16.23,
2560
+ "ppl": 1.00044,
2561
+ "step": 1790,
2562
+ "tokens/total": 29339648,
2563
+ "tokens/train_per_sec_per_gpu": 14.08,
2564
+ "tokens/trainable": 9290289
2565
+ },
2566
+ {
2567
+ "epoch": 1.7466666666666666,
2568
+ "grad_norm": 0.0022802259773015976,
2569
+ "learning_rate": 0.00016453197728367563,
2570
+ "loss": 0.00032380607444792986,
2571
+ "memory/device_reserved (GiB)": 20.01,
2572
+ "memory/max_active (GiB)": 16.23,
2573
+ "memory/max_allocated (GiB)": 16.23,
2574
+ "ppl": 1.00032,
2575
+ "step": 1800,
2576
+ "tokens/total": 29503488,
2577
+ "tokens/train_per_sec_per_gpu": 13.73,
2578
+ "tokens/trainable": 9341953
2579
+ },
2580
+ {
2581
+ "epoch": 1.7563636363636363,
2582
+ "grad_norm": 0.0036841712426394224,
2583
+ "learning_rate": 0.00016401328182935417,
2584
+ "loss": 0.0006712255533784627,
2585
+ "memory/device_reserved (GiB)": 20.01,
2586
+ "memory/max_active (GiB)": 16.23,
2587
+ "memory/max_allocated (GiB)": 16.23,
2588
+ "ppl": 1.00067,
2589
+ "step": 1810,
2590
+ "tokens/total": 29667328,
2591
+ "tokens/train_per_sec_per_gpu": 16.36,
2592
+ "tokens/trainable": 9393126
2593
+ },
2594
+ {
2595
+ "epoch": 1.766060606060606,
2596
+ "grad_norm": 0.0006454121321439743,
2597
+ "learning_rate": 0.0001634916518878404,
2598
+ "loss": 0.00010477005271241069,
2599
+ "memory/device_reserved (GiB)": 20.01,
2600
+ "memory/max_active (GiB)": 16.23,
2601
+ "memory/max_allocated (GiB)": 16.23,
2602
+ "ppl": 1.0001,
2603
+ "step": 1820,
2604
+ "tokens/total": 29831168,
2605
+ "tokens/train_per_sec_per_gpu": 14.7,
2606
+ "tokens/trainable": 9444494
2607
+ },
2608
+ {
2609
+ "epoch": 1.7757575757575759,
2610
+ "grad_norm": 0.035474907606840134,
2611
+ "learning_rate": 0.00016296711137161535,
2612
+ "loss": 0.00034273902419954536,
2613
+ "memory/device_reserved (GiB)": 20.01,
2614
+ "memory/max_active (GiB)": 16.23,
2615
+ "memory/max_allocated (GiB)": 16.23,
2616
+ "ppl": 1.00034,
2617
+ "step": 1830,
2618
+ "tokens/total": 29995008,
2619
+ "tokens/train_per_sec_per_gpu": 14.78,
2620
+ "tokens/trainable": 9496432
2621
+ },
2622
+ {
2623
+ "epoch": 1.7854545454545454,
2624
+ "grad_norm": 0.0042278701439499855,
2625
+ "learning_rate": 0.00016243968432658605,
2626
+ "loss": 0.0004896576981991529,
2627
+ "memory/device_reserved (GiB)": 20.01,
2628
+ "memory/max_active (GiB)": 16.23,
2629
+ "memory/max_allocated (GiB)": 16.23,
2630
+ "ppl": 1.00049,
2631
+ "step": 1840,
2632
+ "tokens/total": 30158848,
2633
+ "tokens/train_per_sec_per_gpu": 15.01,
2634
+ "tokens/trainable": 9547913
2635
+ },
2636
+ {
2637
+ "epoch": 1.7951515151515152,
2638
+ "grad_norm": 0.008337569423019886,
2639
+ "learning_rate": 0.00016190939493098344,
2640
+ "loss": 0.0003711160738021135,
2641
+ "memory/device_reserved (GiB)": 20.01,
2642
+ "memory/max_active (GiB)": 16.23,
2643
+ "memory/max_allocated (GiB)": 16.23,
2644
+ "ppl": 1.00037,
2645
+ "step": 1850,
2646
+ "tokens/total": 30322688,
2647
+ "tokens/train_per_sec_per_gpu": 14.24,
2648
+ "tokens/trainable": 9599023
2649
+ },
2650
+ {
2651
+ "epoch": 1.8048484848484847,
2652
+ "grad_norm": 0.033457424491643906,
2653
+ "learning_rate": 0.00016137626749425377,
2654
+ "loss": 0.0005191094242036343,
2655
+ "memory/device_reserved (GiB)": 20.01,
2656
+ "memory/max_active (GiB)": 16.23,
2657
+ "memory/max_allocated (GiB)": 16.23,
2658
+ "ppl": 1.00052,
2659
+ "step": 1860,
2660
+ "tokens/total": 30486528,
2661
+ "tokens/train_per_sec_per_gpu": 14.35,
2662
+ "tokens/trainable": 9651048
2663
+ },
2664
+ {
2665
+ "epoch": 1.8145454545454545,
2666
+ "grad_norm": 0.014811063185334206,
2667
+ "learning_rate": 0.0001608403264559445,
2668
+ "loss": 0.0002689486602321267,
2669
+ "memory/device_reserved (GiB)": 20.01,
2670
+ "memory/max_active (GiB)": 16.23,
2671
+ "memory/max_allocated (GiB)": 16.23,
2672
+ "ppl": 1.00027,
2673
+ "step": 1870,
2674
+ "tokens/total": 30650368,
2675
+ "tokens/train_per_sec_per_gpu": 14.52,
2676
+ "tokens/trainable": 9703354
2677
+ },
2678
+ {
2679
+ "epoch": 1.8242424242424242,
2680
+ "grad_norm": 0.011829032562673092,
2681
+ "learning_rate": 0.00016030159638458376,
2682
+ "loss": 0.0003055253764614463,
2683
+ "memory/device_reserved (GiB)": 20.01,
2684
+ "memory/max_active (GiB)": 16.23,
2685
+ "memory/max_allocated (GiB)": 16.23,
2686
+ "ppl": 1.00031,
2687
+ "step": 1880,
2688
+ "tokens/total": 30814208,
2689
+ "tokens/train_per_sec_per_gpu": 14.05,
2690
+ "tokens/trainable": 9755371
2691
+ },
2692
+ {
2693
+ "epoch": 1.833939393939394,
2694
+ "grad_norm": 0.003898326540365815,
2695
+ "learning_rate": 0.00015976010197655397,
2696
+ "loss": 0.00023026440758258104,
2697
+ "memory/device_reserved (GiB)": 20.01,
2698
+ "memory/max_active (GiB)": 16.23,
2699
+ "memory/max_allocated (GiB)": 16.23,
2700
+ "ppl": 1.00023,
2701
+ "step": 1890,
2702
+ "tokens/total": 30978048,
2703
+ "tokens/train_per_sec_per_gpu": 13.89,
2704
+ "tokens/trainable": 9807011
2705
+ },
2706
+ {
2707
+ "epoch": 1.8436363636363637,
2708
+ "grad_norm": 0.00993694830685854,
2709
+ "learning_rate": 0.00015921586805496004,
2710
+ "loss": 0.000414779270067811,
2711
+ "memory/device_reserved (GiB)": 20.01,
2712
+ "memory/max_active (GiB)": 16.23,
2713
+ "memory/max_allocated (GiB)": 16.23,
2714
+ "ppl": 1.00041,
2715
+ "step": 1900,
2716
+ "tokens/total": 31141888,
2717
+ "tokens/train_per_sec_per_gpu": 14.42,
2718
+ "tokens/trainable": 9859849
2719
+ },
2720
+ {
2721
+ "epoch": 1.8533333333333335,
2722
+ "grad_norm": 0.00715588079765439,
2723
+ "learning_rate": 0.0001586689195684911,
2724
+ "loss": 0.0004666011780500412,
2725
+ "memory/device_reserved (GiB)": 20.01,
2726
+ "memory/max_active (GiB)": 16.23,
2727
+ "memory/max_allocated (GiB)": 16.23,
2728
+ "ppl": 1.00047,
2729
+ "step": 1910,
2730
+ "tokens/total": 31305728,
2731
+ "tokens/train_per_sec_per_gpu": 14.16,
2732
+ "tokens/trainable": 9911712
2733
+ },
2734
+ {
2735
+ "epoch": 1.863030303030303,
2736
+ "grad_norm": 0.021137356758117676,
2737
+ "learning_rate": 0.000158119281590277,
2738
+ "loss": 0.00046254890039563177,
2739
+ "memory/device_reserved (GiB)": 20.01,
2740
+ "memory/max_active (GiB)": 16.23,
2741
+ "memory/max_allocated (GiB)": 16.23,
2742
+ "ppl": 1.00046,
2743
+ "step": 1920,
2744
+ "tokens/total": 31469568,
2745
+ "tokens/train_per_sec_per_gpu": 14.81,
2746
+ "tokens/trainable": 9963813
2747
+ },
2748
+ {
2749
+ "epoch": 1.8727272727272726,
2750
+ "grad_norm": 0.0023340010084211826,
2751
+ "learning_rate": 0.000157566979316739,
2752
+ "loss": 0.0004919813480228185,
2753
+ "memory/device_reserved (GiB)": 20.01,
2754
+ "memory/max_active (GiB)": 16.23,
2755
+ "memory/max_allocated (GiB)": 16.23,
2756
+ "ppl": 1.00049,
2757
+ "step": 1930,
2758
+ "tokens/total": 31633408,
2759
+ "tokens/train_per_sec_per_gpu": 15.8,
2760
+ "tokens/trainable": 10015724
2761
+ },
2762
+ {
2763
+ "epoch": 1.8824242424242423,
2764
+ "grad_norm": 0.01151804905384779,
2765
+ "learning_rate": 0.00015701203806643433,
2766
+ "loss": 0.00023937469813972712,
2767
+ "memory/device_reserved (GiB)": 20.01,
2768
+ "memory/max_active (GiB)": 16.23,
2769
+ "memory/max_allocated (GiB)": 16.23,
2770
+ "ppl": 1.00024,
2771
+ "step": 1940,
2772
+ "tokens/total": 31797248,
2773
+ "tokens/train_per_sec_per_gpu": 14.32,
2774
+ "tokens/trainable": 10067073
2775
+ },
2776
+ {
2777
+ "epoch": 1.892121212121212,
2778
+ "grad_norm": 0.016535570845007896,
2779
+ "learning_rate": 0.00015645448327889603,
2780
+ "loss": 0.00021827330347150563,
2781
+ "memory/device_reserved (GiB)": 20.01,
2782
+ "memory/max_active (GiB)": 16.23,
2783
+ "memory/max_allocated (GiB)": 16.23,
2784
+ "ppl": 1.00022,
2785
+ "step": 1950,
2786
+ "tokens/total": 31961088,
2787
+ "tokens/train_per_sec_per_gpu": 14.48,
2788
+ "tokens/trainable": 10119393
2789
+ },
2790
+ {
2791
+ "epoch": 1.9018181818181819,
2792
+ "grad_norm": 0.0034130853600800037,
2793
+ "learning_rate": 0.00015589434051346634,
2794
+ "loss": 0.00017861993983387948,
2795
+ "memory/device_reserved (GiB)": 20.01,
2796
+ "memory/max_active (GiB)": 16.23,
2797
+ "memory/max_allocated (GiB)": 16.23,
2798
+ "ppl": 1.00018,
2799
+ "step": 1960,
2800
+ "tokens/total": 32124928,
2801
+ "tokens/train_per_sec_per_gpu": 14.23,
2802
+ "tokens/trainable": 10171930
2803
+ },
2804
+ {
2805
+ "epoch": 1.9115151515151516,
2806
+ "grad_norm": 0.02398502826690674,
2807
+ "learning_rate": 0.0001553316354481253,
2808
+ "loss": 0.00014141426654532552,
2809
+ "memory/device_reserved (GiB)": 20.01,
2810
+ "memory/max_active (GiB)": 16.23,
2811
+ "memory/max_allocated (GiB)": 16.23,
2812
+ "ppl": 1.00014,
2813
+ "step": 1970,
2814
+ "tokens/total": 32288768,
2815
+ "tokens/train_per_sec_per_gpu": 15.59,
2816
+ "tokens/trainable": 10223639
2817
+ },
2818
+ {
2819
+ "epoch": 1.9212121212121214,
2820
+ "grad_norm": 0.0007365989149548113,
2821
+ "learning_rate": 0.00015476639387831343,
2822
+ "loss": 0.00011406640987843275,
2823
+ "memory/device_reserved (GiB)": 20.01,
2824
+ "memory/max_active (GiB)": 16.23,
2825
+ "memory/max_allocated (GiB)": 16.23,
2826
+ "ppl": 1.00011,
2827
+ "step": 1980,
2828
+ "tokens/total": 32452608,
2829
+ "tokens/train_per_sec_per_gpu": 13.45,
2830
+ "tokens/trainable": 10275019
2831
+ },
2832
+ {
2833
+ "epoch": 1.930909090909091,
2834
+ "grad_norm": 0.028317851945757866,
2835
+ "learning_rate": 0.00015419864171574944,
2836
+ "loss": 0.0004076042678207159,
2837
+ "memory/device_reserved (GiB)": 20.01,
2838
+ "memory/max_active (GiB)": 16.23,
2839
+ "memory/max_allocated (GiB)": 16.23,
2840
+ "ppl": 1.00041,
2841
+ "step": 1990,
2842
+ "tokens/total": 32616448,
2843
+ "tokens/train_per_sec_per_gpu": 14.68,
2844
+ "tokens/trainable": 10327234
2845
+ },
2846
+ {
2847
+ "epoch": 1.9406060606060604,
2848
+ "grad_norm": 0.0007216805825009942,
2849
+ "learning_rate": 0.00015362840498724215,
2850
+ "loss": 0.0002287053968757391,
2851
+ "memory/device_reserved (GiB)": 20.01,
2852
+ "memory/max_active (GiB)": 16.23,
2853
+ "memory/max_allocated (GiB)": 16.23,
2854
+ "ppl": 1.00023,
2855
+ "step": 2000,
2856
+ "tokens/total": 32780288,
2857
+ "tokens/train_per_sec_per_gpu": 14.77,
2858
+ "tokens/trainable": 10379906
2859
+ },
2860
+ {
2861
+ "epoch": 1.9503030303030302,
2862
+ "grad_norm": 0.021391045302152634,
2863
+ "learning_rate": 0.00015305570983349743,
2864
+ "loss": 0.0006855262909084558,
2865
+ "memory/device_reserved (GiB)": 20.01,
2866
+ "memory/max_active (GiB)": 16.23,
2867
+ "memory/max_allocated (GiB)": 16.23,
2868
+ "ppl": 1.00069,
2869
+ "step": 2010,
2870
+ "tokens/total": 32944128,
2871
+ "tokens/train_per_sec_per_gpu": 13.75,
2872
+ "tokens/trainable": 10431864
2873
+ },
2874
+ {
2875
+ "epoch": 1.96,
2876
+ "grad_norm": 0.014411289244890213,
2877
+ "learning_rate": 0.00015248058250792008,
2878
+ "loss": 0.00020992583595216274,
2879
+ "memory/device_reserved (GiB)": 20.01,
2880
+ "memory/max_active (GiB)": 16.23,
2881
+ "memory/max_allocated (GiB)": 16.23,
2882
+ "ppl": 1.00021,
2883
+ "step": 2020,
2884
+ "tokens/total": 33107968,
2885
+ "tokens/train_per_sec_per_gpu": 14.32,
2886
+ "tokens/trainable": 10483503
2887
+ },
2888
+ {
2889
+ "epoch": 1.9696969696969697,
2890
+ "grad_norm": 0.0019180785166099668,
2891
+ "learning_rate": 0.00015190304937540993,
2892
+ "loss": 0.000295165297575295,
2893
+ "memory/device_reserved (GiB)": 20.01,
2894
+ "memory/max_active (GiB)": 16.23,
2895
+ "memory/max_allocated (GiB)": 16.23,
2896
+ "ppl": 1.0003,
2897
+ "step": 2030,
2898
+ "tokens/total": 33271808,
2899
+ "tokens/train_per_sec_per_gpu": 15.32,
2900
+ "tokens/trainable": 10534682
2901
+ },
2902
+ {
2903
+ "epoch": 1.9793939393939395,
2904
+ "grad_norm": 0.027906686067581177,
2905
+ "learning_rate": 0.00015132313691115367,
2906
+ "loss": 0.00030230602715164423,
2907
+ "memory/device_reserved (GiB)": 20.01,
2908
+ "memory/max_active (GiB)": 16.23,
2909
+ "memory/max_allocated (GiB)": 16.23,
2910
+ "ppl": 1.0003,
2911
+ "step": 2040,
2912
+ "tokens/total": 33435648,
2913
+ "tokens/train_per_sec_per_gpu": 13.52,
2914
+ "tokens/trainable": 10586848
2915
+ },
2916
+ {
2917
+ "epoch": 1.9890909090909092,
2918
+ "grad_norm": 0.030775317922234535,
2919
+ "learning_rate": 0.00015074087169941085,
2920
+ "loss": 0.00011671001557260752,
2921
+ "memory/device_reserved (GiB)": 20.01,
2922
+ "memory/max_active (GiB)": 16.23,
2923
+ "memory/max_allocated (GiB)": 16.23,
2924
+ "ppl": 1.00012,
2925
+ "step": 2050,
2926
+ "tokens/total": 33599488,
2927
+ "tokens/train_per_sec_per_gpu": 14.23,
2928
+ "tokens/trainable": 10638485
2929
+ },
2930
+ {
2931
+ "epoch": 1.9987878787878788,
2932
+ "grad_norm": 0.054577309638261795,
2933
+ "learning_rate": 0.00015015628043229523,
2934
+ "loss": 0.0003703285474330187,
2935
+ "memory/device_reserved (GiB)": 20.01,
2936
+ "memory/max_active (GiB)": 16.23,
2937
+ "memory/max_allocated (GiB)": 16.23,
2938
+ "ppl": 1.00037,
2939
+ "step": 2060,
2940
+ "tokens/total": 33763328,
2941
+ "tokens/train_per_sec_per_gpu": 14.81,
2942
+ "tokens/trainable": 10689855
2943
+ }
2944
+ ],
2945
+ "logging_steps": 10,
2946
+ "max_steps": 5155,
2947
+ "num_input_tokens_seen": 0,
2948
+ "num_train_epochs": 5,
2949
+ "save_steps": 1031,
2950
+ "stateful_callbacks": {
2951
+ "TrainerControl": {
2952
+ "args": {
2953
+ "should_epoch_stop": false,
2954
+ "should_evaluate": false,
2955
+ "should_log": false,
2956
+ "should_save": true,
2957
+ "should_training_stop": false
2958
+ },
2959
+ "attributes": {}
2960
+ }
2961
+ },
2962
+ "total_flos": 7.504593503083561e+17,
2963
+ "train_batch_size": 2,
2964
+ "trial_name": null,
2965
+ "trial_params": null
2966
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-2062/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
3
+ size 7121
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "q_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:304cf3c64ceeb4dbb87e6d765e3fdd3d8b3df46600c6e4d2ab994562417e6d49
3
+ size 264308896
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7d8f1e89d2e8184d2cc04e29ba3277d83504548164114bf1fa45b8def190b14
3
+ size 528915403
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11373fdb5420e35d6d93ff498e2565c10ff01f1d221981eab3aa5b4440e7e839
3
+ size 14645
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67dd49c975b5d448314d39403a62311e9125e433e71f19378514313c6ecb95fd
3
+ size 1465
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/tokens_state. ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total": 50685952, "trainable": 16045130}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
3
+ size 7121
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "q_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7907c6a742aff25f84719b2e90e16acc2e79bd97ad9e7127dbd22e6e86445cc0
3
+ size 264308896
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1243521d4e81e503200c0a7fe4556360192e75412d7a01df57b733f134d517d2
3
+ size 528915403
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f603d8f7ba790664405ac7fd41c632b9b529eac52f0f9d90a909cf98e312030e
3
+ size 14645
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a257e56f88f21994d367dc800f9fb8e354b66c8cc6ee4d584b76332e1d572c3c
3
+ size 1465
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/tokens_state. ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total": 67588096, "trainable": 21396854}
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29b464b8810e63db4689f2a7488bb151d3c44002b850563c9f99c9489ec58c9
3
+ size 7121
checkpoints/math_operations/lora_sft_primitive_atomic_50k/debug.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_converted.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_results.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2
+ math_operations,balanced_test_alpaca_results,500,8,1.60,500,100.00,492
checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/eval_summary.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "overall": {
3
+ "total": 500,
4
+ "correct": 8,
5
+ "accuracy": 1.6,
6
+ "format_found": 500,
7
+ "format_accuracy": 100.0
8
+ },
9
+ "per_operation": {
10
+ "a": {
11
+ "total": 25,
12
+ "correct": 0,
13
+ "accuracy": 0.0,
14
+ "format_found": 25
15
+ },
16
+ "b": {
17
+ "total": 25,
18
+ "correct": 0,
19
+ "accuracy": 0.0,
20
+ "format_found": 25
21
+ },
22
+ "c": {
23
+ "total": 25,
24
+ "correct": 1,
25
+ "accuracy": 4.0,
26
+ "format_found": 25
27
+ },
28
+ "d": {
29
+ "total": 25,
30
+ "correct": 1,
31
+ "accuracy": 4.0,
32
+ "format_found": 25
33
+ },
34
+ "e": {
35
+ "total": 25,
36
+ "correct": 0,
37
+ "accuracy": 0.0,
38
+ "format_found": 25
39
+ },
40
+ "f": {
41
+ "total": 25,
42
+ "correct": 0,
43
+ "accuracy": 0.0,
44
+ "format_found": 25
45
+ },
46
+ "g": {
47
+ "total": 25,
48
+ "correct": 2,
49
+ "accuracy": 8.0,
50
+ "format_found": 25
51
+ },
52
+ "h": {
53
+ "total": 25,
54
+ "correct": 0,
55
+ "accuracy": 0.0,
56
+ "format_found": 25
57
+ },
58
+ "i": {
59
+ "total": 25,
60
+ "correct": 1,
61
+ "accuracy": 4.0,
62
+ "format_found": 25
63
+ },
64
+ "j": {
65
+ "total": 25,
66
+ "correct": 0,
67
+ "accuracy": 0.0,
68
+ "format_found": 25
69
+ },
70
+ "k": {
71
+ "total": 25,
72
+ "correct": 0,
73
+ "accuracy": 0.0,
74
+ "format_found": 25
75
+ },
76
+ "l": {
77
+ "total": 25,
78
+ "correct": 0,
79
+ "accuracy": 0.0,
80
+ "format_found": 25
81
+ },
82
+ "m": {
83
+ "total": 25,
84
+ "correct": 0,
85
+ "accuracy": 0.0,
86
+ "format_found": 25
87
+ },
88
+ "n": {
89
+ "total": 25,
90
+ "correct": 0,
91
+ "accuracy": 0.0,
92
+ "format_found": 25
93
+ },
94
+ "o": {
95
+ "total": 25,
96
+ "correct": 1,
97
+ "accuracy": 4.0,
98
+ "format_found": 25
99
+ },
100
+ "p": {
101
+ "total": 25,
102
+ "correct": 2,
103
+ "accuracy": 8.0,
104
+ "format_found": 25
105
+ },
106
+ "q": {
107
+ "total": 25,
108
+ "correct": 0,
109
+ "accuracy": 0.0,
110
+ "format_found": 25
111
+ },
112
+ "r": {
113
+ "total": 25,
114
+ "correct": 0,
115
+ "accuracy": 0.0,
116
+ "format_found": 25
117
+ },
118
+ "s": {
119
+ "total": 25,
120
+ "correct": 0,
121
+ "accuracy": 0.0,
122
+ "format_found": 25
123
+ },
124
+ "t": {
125
+ "total": 25,
126
+ "correct": 0,
127
+ "accuracy": 0.0,
128
+ "format_found": 25
129
+ }
130
+ },
131
+ "n_errors": 492,
132
+ "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/lora_sft_primitive_atomic_50k/eval_results_easy_ops/balanced_test_alpaca_results.jsonl"
133
+ }