narySt committed on
Commit
f0df813
·
verified ·
1 Parent(s): 12f2c31

Upload routing tuning test outputs 2026-04-07

Browse files
Files changed (32) hide show
  1. routing_tuning_test_07_04/N_2.5/.hydra/config.yaml +53 -0
  2. routing_tuning_test_07_04/N_2.5/.hydra/hydra.yaml +162 -0
  3. routing_tuning_test_07_04/N_2.5/.hydra/overrides.yaml +2 -0
  4. routing_tuning_test_07_04/N_2.5/model_best.pt +3 -0
  5. routing_tuning_test_07_04/N_2.5/model_final.pt +3 -0
  6. routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_0.pt +3 -0
  7. routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_781.pt +3 -0
  8. routing_tuning_test_07_04/N_2.5/train.log +386 -0
  9. routing_tuning_test_07_04/N_4.0/.hydra/config.yaml +53 -0
  10. routing_tuning_test_07_04/N_4.0/.hydra/hydra.yaml +162 -0
  11. routing_tuning_test_07_04/N_4.0/.hydra/overrides.yaml +2 -0
  12. routing_tuning_test_07_04/N_4.0/model_best.pt +3 -0
  13. routing_tuning_test_07_04/N_4.0/model_final.pt +3 -0
  14. routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_0.pt +3 -0
  15. routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_781.pt +3 -0
  16. routing_tuning_test_07_04/N_4.0/train.log +386 -0
  17. routing_tuning_test_07_04/N_6.0/.hydra/config.yaml +53 -0
  18. routing_tuning_test_07_04/N_6.0/.hydra/hydra.yaml +162 -0
  19. routing_tuning_test_07_04/N_6.0/.hydra/overrides.yaml +2 -0
  20. routing_tuning_test_07_04/N_6.0/model_best.pt +3 -0
  21. routing_tuning_test_07_04/N_6.0/model_final.pt +3 -0
  22. routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_0.pt +3 -0
  23. routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_781.pt +3 -0
  24. routing_tuning_test_07_04/N_6.0/train.log +386 -0
  25. routing_tuning_test_07_04/N_8.0/.hydra/config.yaml +53 -0
  26. routing_tuning_test_07_04/N_8.0/.hydra/hydra.yaml +162 -0
  27. routing_tuning_test_07_04/N_8.0/.hydra/overrides.yaml +2 -0
  28. routing_tuning_test_07_04/N_8.0/model_best.pt +3 -0
  29. routing_tuning_test_07_04/N_8.0/model_final.pt +3 -0
  30. routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_0.pt +3 -0
  31. routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_781.pt +3 -0
  32. routing_tuning_test_07_04/N_8.0/train.log +386 -0
routing_tuning_test_07_04/N_2.5/.hydra/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 1
6
+ max_steps: null
7
+ batch_size: 8
8
+ eval_batch_size: 24
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ lr_multiplier:
22
+ - 2.0
23
+ - 1.5
24
+ - 1.0
25
+ load_balancing_weight: 0.05
26
+ load_balancing_N: 2.5
27
+ max_grad_norm: 1.0
28
+ use_amp: true
29
+ resume: false
30
+ resume_checkpoint: null
31
+ warmup_model: true
32
+ data:
33
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
34
+ max_context_len: 4096
35
+ max_target_len: 256
36
+ num_workers: 0
37
+ pin_memory: true
38
+ max_train_samples: 50000
39
+ max_val_samples: null
40
+ logging:
41
+ log_interval: 10
42
+ save_interval: 1000
43
+ eval_interval: 250
44
+ save_every_epoch: false
45
+ model_only_checkpoints: true
46
+ tracking:
47
+ enabled: true
48
+ project: routing-evolution
49
+ run_name: routing_N2.5
50
+ paths:
51
+ output_dir: outputs/N_${training.load_balancing_N}
52
+ seed: 42
53
+ device: cuda
routing_tuning_test_07_04/N_2.5/.hydra/hydra.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - training.load_balancing_N=2.5
116
+ - tracking.run_name=routing_N2.5
117
+ job:
118
+ name: train
119
+ chdir: false
120
+ override_dirname: tracking.run_name=routing_N2.5,training.load_balancing_N=2.5
121
+ id: ???
122
+ num: ???
123
+ config_name: config
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /workspace/byte-llms-code/routing_evolution_exp
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /workspace/byte-llms-code/routing_evolution_exp/configs
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_2.5
146
+ choices:
147
+ paths: default
148
+ tracking: default
149
+ logging: default
150
+ data: default
151
+ training: default
152
+ model: hnet_xl_code
153
+ hydra/env: default
154
+ hydra/callbacks: null
155
+ hydra/job_logging: default
156
+ hydra/hydra_logging: default
157
+ hydra/hydra_help: default
158
+ hydra/help: default
159
+ hydra/sweeper: basic
160
+ hydra/launcher: basic
161
+ hydra/output: default
162
+ verbose: false
routing_tuning_test_07_04/N_2.5/.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - training.load_balancing_N=2.5
2
+ - tracking.run_name=routing_N2.5
routing_tuning_test_07_04/N_2.5/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56ffd4eec1c3270b45b6fe7338584cac9f6a57aee6737fc373ffd7b3e5731461
3
+ size 3315165139
routing_tuning_test_07_04/N_2.5/model_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba7c46532c2086aa0a0a61a36a91b28c01addcff63728d9a90892e702af44611
3
+ size 3315165484
routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
3
+ size 13633736
routing_tuning_test_07_04/N_2.5/routing_weights/routing_step_781.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c02d24288839c9545a03982a22732394ed5d2a755a9f14de1eb8df2038f79d8f
3
+ size 13633752
routing_tuning_test_07_04/N_2.5/train.log ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-04-07 17:39:50] CUDA_VISIBLE_DEVICES: 0,1
2
+ [2026-04-07 17:39:50] Number of processes: 2
3
+ [2026-04-07 17:39:50] Mixed precision: bf16
4
+ [2026-04-07 17:39:50] ============================================================
5
+ [2026-04-07 17:39:50] Routing Evolution Experiment | N=2.5
6
+ [2026-04-07 17:39:50] ============================================================
7
+ [2026-04-07 17:39:50] Config:
8
+ model:
9
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
10
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
11
+ training:
12
+ epochs: 1
13
+ max_steps: null
14
+ batch_size: 8
15
+ eval_batch_size: 24
16
+ gradient_accumulation_steps: 4
17
+ lr: 0.0001
18
+ weight_decay: 0.1
19
+ betas:
20
+ - 0.9
21
+ - 0.95
22
+ eps: 1.0e-08
23
+ lr_scheduler: wsd
24
+ warmup_ratio: 0.1
25
+ decay_ratio: 0.2
26
+ warmup_steps: 100
27
+ min_lr_ratio: 0.1
28
+ lr_multiplier:
29
+ - 2.0
30
+ - 1.5
31
+ - 1.0
32
+ load_balancing_weight: 0.05
33
+ load_balancing_N: 2.5
34
+ max_grad_norm: 1.0
35
+ use_amp: true
36
+ resume: false
37
+ resume_checkpoint: null
38
+ warmup_model: true
39
+ data:
40
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
41
+ max_context_len: 4096
42
+ max_target_len: 256
43
+ num_workers: 0
44
+ pin_memory: true
45
+ max_train_samples: 50000
46
+ max_val_samples: null
47
+ logging:
48
+ log_interval: 10
49
+ save_interval: 1000
50
+ eval_interval: 250
51
+ save_every_epoch: false
52
+ model_only_checkpoints: true
53
+ tracking:
54
+ enabled: true
55
+ project: routing-evolution
56
+ run_name: routing_N2.5
57
+ paths:
58
+ output_dir: outputs/N_2.5
59
+ seed: 42
60
+ device: cuda
61
+
62
+ [2026-04-07 17:39:51] Loading model...
63
+ [2026-04-07 17:39:57] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
64
+ [2026-04-07 17:39:57] Applied LR multipliers: [2.0, 1.5, 1.0]
65
+ [2026-04-07 17:39:57] Warming up model...
66
+ [2026-04-07 17:40:43] Total params: 1,654,090,112
67
+ [2026-04-07 17:40:43] Trainable params: 1,654,090,112
68
+ [2026-04-07 17:40:43] Creating dataloaders...
69
+ [2026-04-07 17:40:43] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
70
+ [2026-04-07 17:40:43] Max steps: 781, Steps per epoch: 3125
71
+ [2026-04-07 17:40:45] Starting training...
72
+ [2026-04-07 17:40:45]
73
+ ============================================================
74
+ [2026-04-07 17:40:45] EPOCH 1/1 (step 0)
75
+ [2026-04-07 17:40:45] ============================================================
76
+ [2026-04-07 17:41:18] Epoch 1 | Step 10 | Loss: 0.6962 | LM: 0.6496 | LB: 1.0154 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
77
+ [2026-04-07 17:41:28] Epoch 1 | Step 20 | Loss: 0.5961 | LM: 0.5509 | LB: 1.0146 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.459 | LR: 5.62e-05
78
+ [2026-04-07 17:41:37] Epoch 1 | Step 30 | Loss: 0.5376 | LM: 0.4788 | LB: 1.0147 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.362 | HR1: 0.480/SR1: 0.458 | LR: 7.92e-05
79
+ [2026-04-07 17:41:47] Epoch 1 | Step 40 | Loss: 0.5016 | LM: 0.4479 | LB: 1.0154 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.478/SR1: 0.457 | LR: 1.00e-04
80
+ [2026-04-07 17:41:56] Epoch 1 | Step 50 | Loss: 0.4791 | LM: 0.4096 | LB: 1.0145 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.475/SR1: 0.455 | LR: 1.00e-04
81
+ [2026-04-07 17:42:06] Epoch 1 | Step 60 | Loss: 0.4529 | LM: 0.3812 | LB: 1.0148 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.477/SR1: 0.457 | LR: 1.00e-04
82
+ [2026-04-07 17:42:15] Epoch 1 | Step 70 | Loss: 0.4386 | LM: 0.3774 | LB: 1.0148 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.358 | HR1: 0.477/SR1: 0.456 | LR: 1.00e-04
83
+ [2026-04-07 17:42:24] Epoch 1 | Step 80 | Loss: 0.4239 | LM: 0.3666 | LB: 1.0146 | CL0: 2.8 | CL1: 2.1 | HR0: 0.360/SR0: 0.359 | HR1: 0.476/SR1: 0.456 | LR: 1.00e-04
84
+ [2026-04-07 17:42:33] Epoch 1 | Step 90 | Loss: 0.4154 | LM: 0.3536 | LB: 1.0142 | CL0: 2.8 | CL1: 2.1 | HR0: 0.361/SR0: 0.359 | HR1: 0.475/SR1: 0.454 | LR: 1.00e-04
85
+ [2026-04-07 17:42:42] Epoch 1 | Step 100 | Loss: 0.4104 | LM: 0.3510 | LB: 1.0146 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.475/SR1: 0.454 | LR: 1.00e-04
86
+ [2026-04-07 17:42:51] Epoch 1 | Step 110 | Loss: 0.4061 | LM: 0.3457 | LB: 1.0144 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.474/SR1: 0.454 | LR: 1.00e-04
87
+ [2026-04-07 17:43:00] Epoch 1 | Step 120 | Loss: 0.4037 | LM: 0.3472 | LB: 1.0143 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.474/SR1: 0.453 | LR: 1.00e-04
88
+ [2026-04-07 17:43:10] Epoch 1 | Step 130 | Loss: 0.3998 | LM: 0.3442 | LB: 1.0143 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.474/SR1: 0.453 | LR: 1.00e-04
89
+ [2026-04-07 17:43:19] Epoch 1 | Step 140 | Loss: 0.3955 | LM: 0.3397 | LB: 1.0140 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.472/SR1: 0.452 | LR: 1.00e-04
90
+ [2026-04-07 17:43:28] Epoch 1 | Step 150 | Loss: 0.3932 | LM: 0.3373 | LB: 1.0138 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.472/SR1: 0.451 | LR: 1.00e-04
91
+ [2026-04-07 17:43:37] Epoch 1 | Step 160 | Loss: 0.3889 | LM: 0.3319 | LB: 1.0137 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.471/SR1: 0.450 | LR: 1.00e-04
92
+ [2026-04-07 17:43:46] Epoch 1 | Step 170 | Loss: 0.3841 | LM: 0.3272 | LB: 1.0134 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.470/SR1: 0.450 | LR: 1.00e-04
93
+ [2026-04-07 17:43:55] Epoch 1 | Step 180 | Loss: 0.3806 | LM: 0.3231 | LB: 1.0133 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.470/SR1: 0.449 | LR: 1.00e-04
94
+ [2026-04-07 17:44:05] Epoch 1 | Step 190 | Loss: 0.3789 | LM: 0.3227 | LB: 1.0130 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.469/SR1: 0.448 | LR: 1.00e-04
95
+ [2026-04-07 17:44:14] Epoch 1 | Step 200 | Loss: 0.3763 | LM: 0.3198 | LB: 1.0130 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.469/SR1: 0.448 | LR: 1.00e-04
96
+ [2026-04-07 17:44:23] Epoch 1 | Step 210 | Loss: 0.3761 | LM: 0.3201 | LB: 1.0129 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.468/SR1: 0.447 | LR: 1.00e-04
97
+ [2026-04-07 17:44:33] Epoch 1 | Step 220 | Loss: 0.3752 | LM: 0.3180 | LB: 1.0128 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.468/SR1: 0.447 | LR: 1.00e-04
98
+ [2026-04-07 17:44:42] Epoch 1 | Step 230 | Loss: 0.3739 | LM: 0.3172 | LB: 1.0126 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.468/SR1: 0.446 | LR: 1.00e-04
99
+ [2026-04-07 17:44:51] Epoch 1 | Step 240 | Loss: 0.3718 | LM: 0.3183 | LB: 1.0126 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.467/SR1: 0.446 | LR: 1.00e-04
100
+ [2026-04-07 17:45:00] Epoch 1 | Step 250 | Loss: 0.3695 | LM: 0.3165 | LB: 1.0124 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.467/SR1: 0.445 | LR: 1.00e-04
101
+ [2026-04-07 17:45:01] Validation | Batch 10/732 | Loss: 0.3270 | LM: 0.2764
102
+ [2026-04-07 17:45:02] Validation | Batch 20/732 | Loss: 0.3483 | LM: 0.2979
103
+ [2026-04-07 17:45:04] Validation | Batch 30/732 | Loss: 0.3395 | LM: 0.2890
104
+ [2026-04-07 17:45:05] Validation | Batch 40/732 | Loss: 0.3440 | LM: 0.2936
105
+ [2026-04-07 17:45:06] Validation | Batch 50/732 | Loss: 0.3441 | LM: 0.2937
106
+ [2026-04-07 17:45:07] Validation | Batch 60/732 | Loss: 0.3466 | LM: 0.2962
107
+ [2026-04-07 17:45:08] Validation | Batch 70/732 | Loss: 0.3503 | LM: 0.2999
108
+ [2026-04-07 17:45:10] Validation | Batch 80/732 | Loss: 0.3487 | LM: 0.2983
109
+ [2026-04-07 17:45:11] Validation | Batch 90/732 | Loss: 0.3482 | LM: 0.2978
110
+ [2026-04-07 17:45:12] Validation | Batch 100/732 | Loss: 0.3493 | LM: 0.2989
111
+ [2026-04-07 17:45:13] Validation | Batch 110/732 | Loss: 0.3461 | LM: 0.2957
112
+ [2026-04-07 17:45:14] Validation | Batch 120/732 | Loss: 0.3494 | LM: 0.2990
113
+ [2026-04-07 17:45:16] Validation | Batch 130/732 | Loss: 0.3508 | LM: 0.3004
114
+ [2026-04-07 17:45:17] Validation | Batch 140/732 | Loss: 0.3502 | LM: 0.2998
115
+ [2026-04-07 17:45:18] Validation | Batch 150/732 | Loss: 0.3495 | LM: 0.2991
116
+ [2026-04-07 17:45:19] Validation | Batch 160/732 | Loss: 0.3485 | LM: 0.2981
117
+ [2026-04-07 17:45:20] Validation | Batch 170/732 | Loss: 0.3490 | LM: 0.2986
118
+ [2026-04-07 17:45:21] Validation | Batch 180/732 | Loss: 0.3501 | LM: 0.2997
119
+ [2026-04-07 17:45:22] Validation | Batch 190/732 | Loss: 0.3495 | LM: 0.2991
120
+ [2026-04-07 17:45:23] Validation | Batch 200/732 | Loss: 0.3496 | LM: 0.2992
121
+ [2026-04-07 17:45:24] Validation | Batch 210/732 | Loss: 0.3488 | LM: 0.2984
122
+ [2026-04-07 17:45:26] Validation | Batch 220/732 | Loss: 0.3482 | LM: 0.2978
123
+ [2026-04-07 17:45:27] Validation | Batch 230/732 | Loss: 0.3486 | LM: 0.2982
124
+ [2026-04-07 17:45:28] Validation | Batch 240/732 | Loss: 0.3483 | LM: 0.2979
125
+ [2026-04-07 17:45:30] Validation | Batch 250/732 | Loss: 0.3484 | LM: 0.2980
126
+ [2026-04-07 17:45:31] Validation | Batch 260/732 | Loss: 0.3474 | LM: 0.2970
127
+ [2026-04-07 17:45:32] Validation | Batch 270/732 | Loss: 0.3471 | LM: 0.2967
128
+ [2026-04-07 17:45:33] Validation | Batch 280/732 | Loss: 0.3460 | LM: 0.2956
129
+ [2026-04-07 17:45:34] Validation | Batch 290/732 | Loss: 0.3458 | LM: 0.2954
130
+ [2026-04-07 17:45:35] Validation | Batch 300/732 | Loss: 0.3457 | LM: 0.2953
131
+ [2026-04-07 17:45:36] Validation | Batch 310/732 | Loss: 0.3456 | LM: 0.2952
132
+ [2026-04-07 17:45:38] Validation | Batch 320/732 | Loss: 0.3447 | LM: 0.2943
133
+ [2026-04-07 17:45:39] Validation | Batch 330/732 | Loss: 0.3435 | LM: 0.2931
134
+ [2026-04-07 17:45:40] Validation | Batch 340/732 | Loss: 0.3429 | LM: 0.2925
135
+ [2026-04-07 17:45:41] Validation | Batch 350/732 | Loss: 0.3432 | LM: 0.2928
136
+ [2026-04-07 17:45:42] Validation | Batch 360/732 | Loss: 0.3441 | LM: 0.2937
137
+ [2026-04-07 17:45:43] Validation | Batch 370/732 | Loss: 0.3431 | LM: 0.2927
138
+ [2026-04-07 17:45:44] Validation | Batch 380/732 | Loss: 0.3424 | LM: 0.2920
139
+ [2026-04-07 17:45:45] Validation | Batch 390/732 | Loss: 0.3420 | LM: 0.2916
140
+ [2026-04-07 17:45:46] Validation | Batch 400/732 | Loss: 0.3418 | LM: 0.2914
141
+ [2026-04-07 17:45:47] Validation | Batch 410/732 | Loss: 0.3411 | LM: 0.2907
142
+ [2026-04-07 17:45:48] Validation | Batch 420/732 | Loss: 0.3413 | LM: 0.2909
143
+ [2026-04-07 17:45:50] Validation | Batch 430/732 | Loss: 0.3412 | LM: 0.2908
144
+ [2026-04-07 17:45:51] Validation | Batch 440/732 | Loss: 0.3407 | LM: 0.2903
145
+ [2026-04-07 17:45:52] Validation | Batch 450/732 | Loss: 0.3405 | LM: 0.2901
146
+ [2026-04-07 17:45:53] Validation | Batch 460/732 | Loss: 0.3409 | LM: 0.2904
147
+ [2026-04-07 17:45:54] Validation | Batch 470/732 | Loss: 0.3406 | LM: 0.2902
148
+ [2026-04-07 17:45:55] Validation | Batch 480/732 | Loss: 0.3408 | LM: 0.2904
149
+ [2026-04-07 17:45:57] Validation | Batch 490/732 | Loss: 0.3419 | LM: 0.2915
150
+ [2026-04-07 17:45:58] Validation | Batch 500/732 | Loss: 0.3429 | LM: 0.2925
151
+ [2026-04-07 17:45:59] Validation | Batch 510/732 | Loss: 0.3426 | LM: 0.2921
152
+ [2026-04-07 17:46:00] Validation | Batch 520/732 | Loss: 0.3423 | LM: 0.2919
153
+ [2026-04-07 17:46:01] Validation | Batch 530/732 | Loss: 0.3417 | LM: 0.2913
154
+ [2026-04-07 17:46:02] Validation | Batch 540/732 | Loss: 0.3419 | LM: 0.2915
155
+ [2026-04-07 17:46:03] Validation | Batch 550/732 | Loss: 0.3418 | LM: 0.2914
156
+ [2026-04-07 17:46:04] Validation | Batch 560/732 | Loss: 0.3414 | LM: 0.2909
157
+ [2026-04-07 17:46:06] Validation | Batch 570/732 | Loss: 0.3415 | LM: 0.2911
158
+ [2026-04-07 17:46:07] Validation | Batch 580/732 | Loss: 0.3412 | LM: 0.2908
159
+ [2026-04-07 17:46:08] Validation | Batch 590/732 | Loss: 0.3412 | LM: 0.2908
160
+ [2026-04-07 17:46:10] Validation | Batch 600/732 | Loss: 0.3411 | LM: 0.2907
161
+ [2026-04-07 17:46:11] Validation | Batch 610/732 | Loss: 0.3417 | LM: 0.2913
162
+ [2026-04-07 17:46:12] Validation | Batch 620/732 | Loss: 0.3421 | LM: 0.2916
163
+ [2026-04-07 17:46:13] Validation | Batch 630/732 | Loss: 0.3419 | LM: 0.2915
164
+ [2026-04-07 17:46:14] Validation | Batch 640/732 | Loss: 0.3416 | LM: 0.2912
165
+ [2026-04-07 17:46:16] Validation | Batch 650/732 | Loss: 0.3414 | LM: 0.2910
166
+ [2026-04-07 17:46:17] Validation | Batch 660/732 | Loss: 0.3419 | LM: 0.2915
167
+ [2026-04-07 17:46:18] Validation | Batch 670/732 | Loss: 0.3425 | LM: 0.2921
168
+ [2026-04-07 17:46:19] Validation | Batch 680/732 | Loss: 0.3424 | LM: 0.2920
169
+ [2026-04-07 17:46:20] Validation | Batch 690/732 | Loss: 0.3426 | LM: 0.2922
170
+ [2026-04-07 17:46:21] Validation | Batch 700/732 | Loss: 0.3431 | LM: 0.2927
171
+ [2026-04-07 17:46:22] Validation | Batch 710/732 | Loss: 0.3434 | LM: 0.2930
172
+ [2026-04-07 17:46:23] Validation | Batch 720/732 | Loss: 0.3444 | LM: 0.2940
173
+ [2026-04-07 17:46:25] Validation | Batch 730/732 | Loss: 0.3441 | LM: 0.2937
174
+ [2026-04-07 17:46:25] Validation | Batch 732/732 | Loss: 0.3439 | LM: 0.2935
175
+ [2026-04-07 17:46:25] Validation | Loss: 0.3439 | LM: 0.2935 | PPL: 1.34 | Time: 84.70s
176
+ [2026-04-07 17:46:27] New best model saved! Val loss: 0.3439
177
+ [2026-04-07 17:46:37] Epoch 1 | Step 260 | Loss: 0.3676 | LM: 0.3155 | LB: 1.0124 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.356 | HR1: 0.467/SR1: 0.445 | LR: 1.00e-04
178
+ [2026-04-07 17:46:46] Epoch 1 | Step 270 | Loss: 0.3676 | LM: 0.3137 | LB: 1.0123 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.466/SR1: 0.445 | LR: 1.00e-04
179
+ [2026-04-07 17:46:55] Epoch 1 | Step 280 | Loss: 0.3674 | LM: 0.3138 | LB: 1.0122 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.356 | HR1: 0.466/SR1: 0.444 | LR: 1.00e-04
180
+ [2026-04-07 17:47:04] Epoch 1 | Step 290 | Loss: 0.3657 | LM: 0.3118 | LB: 1.0121 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.465/SR1: 0.444 | LR: 1.00e-04
181
+ [2026-04-07 17:47:13] Epoch 1 | Step 300 | Loss: 0.3650 | LM: 0.3113 | LB: 1.0119 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.465/SR1: 0.443 | LR: 1.00e-04
182
+ [2026-04-07 17:47:23] Epoch 1 | Step 310 | Loss: 0.3641 | LM: 0.3103 | LB: 1.0117 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.464/SR1: 0.443 | LR: 1.00e-04
183
+ [2026-04-07 17:47:32] Epoch 1 | Step 320 | Loss: 0.3635 | LM: 0.3098 | LB: 1.0116 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.464/SR1: 0.443 | LR: 9.80e-05
184
+ [2026-04-07 17:47:41] Epoch 1 | Step 330 | Loss: 0.3622 | LM: 0.3088 | LB: 1.0115 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.463/SR1: 0.442 | LR: 8.93e-05
185
+ [2026-04-07 17:47:50] Epoch 1 | Step 340 | Loss: 0.3613 | LM: 0.3080 | LB: 1.0114 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.463/SR1: 0.442 | LR: 7.51e-05
186
+ [2026-04-07 17:47:59] Epoch 1 | Step 350 | Loss: 0.3609 | LM: 0.3085 | LB: 1.0113 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 5.77e-05
187
+ [2026-04-07 17:48:09] Epoch 1 | Step 360 | Loss: 0.3602 | LM: 0.3100 | LB: 1.0113 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 3.99e-05
188
+ [2026-04-07 17:48:18] Epoch 1 | Step 370 | Loss: 0.3597 | LM: 0.3096 | LB: 1.0112 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 2.45e-05
189
+ [2026-04-07 17:48:27] Epoch 1 | Step 380 | Loss: 0.3587 | LM: 0.3077 | LB: 1.0112 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.441 | LR: 1.40e-05
190
+ [2026-04-07 17:48:36] Epoch 1 | Step 390 | Loss: 0.3589 | LM: 0.3073 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
191
+ [2026-04-07 17:48:45] Epoch 1 | Step 400 | Loss: 0.3588 | LM: 0.3082 | LB: 1.0111 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
192
+ [2026-04-07 17:48:54] Epoch 1 | Step 410 | Loss: 0.3587 | LM: 0.3081 | LB: 1.0111 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
193
+ [2026-04-07 17:49:03] Epoch 1 | Step 420 | Loss: 0.3588 | LM: 0.3074 | LB: 1.0111 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
194
+ [2026-04-07 17:49:12] Epoch 1 | Step 430 | Loss: 0.3582 | LM: 0.3058 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
195
+ [2026-04-07 17:49:21] Epoch 1 | Step 440 | Loss: 0.3574 | LM: 0.3049 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.462/SR1: 0.440 | LR: 1.00e-05
196
+ [2026-04-07 17:49:31] Epoch 1 | Step 450 | Loss: 0.3568 | LM: 0.3036 | LB: 1.0110 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.461/SR1: 0.440 | LR: 1.00e-05
197
+ [2026-04-07 17:49:40] Epoch 1 | Step 460 | Loss: 0.3569 | LM: 0.3027 | LB: 1.0109 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.461/SR1: 0.439 | LR: 1.00e-05
198
+ [2026-04-07 17:49:49] Epoch 1 | Step 470 | Loss: 0.3567 | LM: 0.3035 | LB: 1.0108 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.461/SR1: 0.439 | LR: 1.00e-05
199
+ [2026-04-07 17:49:58] Epoch 1 | Step 480 | Loss: 0.3563 | LM: 0.3028 | LB: 1.0108 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.461/SR1: 0.439 | LR: 1.00e-05
200
+ [2026-04-07 17:50:07] Epoch 1 | Step 490 | Loss: 0.3556 | LM: 0.3017 | LB: 1.0107 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-05
201
+ [2026-04-07 17:50:16] Epoch 1 | Step 500 | Loss: 0.3558 | LM: 0.3020 | LB: 1.0107 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-05
202
+ [2026-04-07 17:50:17] Validation | Batch 10/732 | Loss: 0.3237 | LM: 0.2732
203
+ [2026-04-07 17:50:19] Validation | Batch 20/732 | Loss: 0.3455 | LM: 0.2951
204
+ [2026-04-07 17:50:20] Validation | Batch 30/732 | Loss: 0.3370 | LM: 0.2866
205
+ [2026-04-07 17:50:21] Validation | Batch 40/732 | Loss: 0.3413 | LM: 0.2909
206
+ [2026-04-07 17:50:22] Validation | Batch 50/732 | Loss: 0.3412 | LM: 0.2908
207
+ [2026-04-07 17:50:23] Validation | Batch 60/732 | Loss: 0.3432 | LM: 0.2928
208
+ [2026-04-07 17:50:25] Validation | Batch 70/732 | Loss: 0.3468 | LM: 0.2964
209
+ [2026-04-07 17:50:26] Validation | Batch 80/732 | Loss: 0.3452 | LM: 0.2948
210
+ [2026-04-07 17:50:27] Validation | Batch 90/732 | Loss: 0.3449 | LM: 0.2945
211
+ [2026-04-07 17:50:28] Validation | Batch 100/732 | Loss: 0.3460 | LM: 0.2956
212
+ [2026-04-07 17:50:30] Validation | Batch 110/732 | Loss: 0.3428 | LM: 0.2924
213
+ [2026-04-07 17:50:31] Validation | Batch 120/732 | Loss: 0.3460 | LM: 0.2956
214
+ [2026-04-07 17:50:32] Validation | Batch 130/732 | Loss: 0.3473 | LM: 0.2969
215
+ [2026-04-07 17:50:33] Validation | Batch 140/732 | Loss: 0.3468 | LM: 0.2964
216
+ [2026-04-07 17:50:34] Validation | Batch 150/732 | Loss: 0.3461 | LM: 0.2957
217
+ [2026-04-07 17:50:35] Validation | Batch 160/732 | Loss: 0.3452 | LM: 0.2949
218
+ [2026-04-07 17:50:36] Validation | Batch 170/732 | Loss: 0.3458 | LM: 0.2954
219
+ [2026-04-07 17:50:38] Validation | Batch 180/732 | Loss: 0.3470 | LM: 0.2966
220
+ [2026-04-07 17:50:39] Validation | Batch 190/732 | Loss: 0.3464 | LM: 0.2960
221
+ [2026-04-07 17:50:40] Validation | Batch 200/732 | Loss: 0.3464 | LM: 0.2961
222
+ [2026-04-07 17:50:41] Validation | Batch 210/732 | Loss: 0.3457 | LM: 0.2953
223
+ [2026-04-07 17:50:42] Validation | Batch 220/732 | Loss: 0.3452 | LM: 0.2948
224
+ [2026-04-07 17:50:43] Validation | Batch 230/732 | Loss: 0.3456 | LM: 0.2952
225
+ [2026-04-07 17:50:45] Validation | Batch 240/732 | Loss: 0.3453 | LM: 0.2949
226
+ [2026-04-07 17:50:46] Validation | Batch 250/732 | Loss: 0.3452 | LM: 0.2949
227
+ [2026-04-07 17:50:47] Validation | Batch 260/732 | Loss: 0.3442 | LM: 0.2938
228
+ [2026-04-07 17:50:48] Validation | Batch 270/732 | Loss: 0.3440 | LM: 0.2937
229
+ [2026-04-07 17:50:49] Validation | Batch 280/732 | Loss: 0.3430 | LM: 0.2926
230
+ [2026-04-07 17:50:50] Validation | Batch 290/732 | Loss: 0.3427 | LM: 0.2923
231
+ [2026-04-07 17:50:51] Validation | Batch 300/732 | Loss: 0.3426 | LM: 0.2923
232
+ [2026-04-07 17:50:53] Validation | Batch 310/732 | Loss: 0.3425 | LM: 0.2921
233
+ [2026-04-07 17:50:54] Validation | Batch 320/732 | Loss: 0.3416 | LM: 0.2912
234
+ [2026-04-07 17:50:55] Validation | Batch 330/732 | Loss: 0.3405 | LM: 0.2901
235
+ [2026-04-07 17:50:56] Validation | Batch 340/732 | Loss: 0.3399 | LM: 0.2895
236
+ [2026-04-07 17:50:57] Validation | Batch 350/732 | Loss: 0.3403 | LM: 0.2899
237
+ [2026-04-07 17:50:58] Validation | Batch 360/732 | Loss: 0.3411 | LM: 0.2907
238
+ [2026-04-07 17:50:59] Validation | Batch 370/732 | Loss: 0.3401 | LM: 0.2898
239
+ [2026-04-07 17:51:00] Validation | Batch 380/732 | Loss: 0.3395 | LM: 0.2891
240
+ [2026-04-07 17:51:01] Validation | Batch 390/732 | Loss: 0.3391 | LM: 0.2887
241
+ [2026-04-07 17:51:02] Validation | Batch 400/732 | Loss: 0.3389 | LM: 0.2885
242
+ [2026-04-07 17:51:03] Validation | Batch 410/732 | Loss: 0.3382 | LM: 0.2878
243
+ [2026-04-07 17:51:05] Validation | Batch 420/732 | Loss: 0.3384 | LM: 0.2880
244
+ [2026-04-07 17:51:06] Validation | Batch 430/732 | Loss: 0.3383 | LM: 0.2879
245
+ [2026-04-07 17:51:07] Validation | Batch 440/732 | Loss: 0.3378 | LM: 0.2875
246
+ [2026-04-07 17:51:08] Validation | Batch 450/732 | Loss: 0.3376 | LM: 0.2873
247
+ [2026-04-07 17:51:10] Validation | Batch 460/732 | Loss: 0.3380 | LM: 0.2876
248
+ [2026-04-07 17:51:11] Validation | Batch 470/732 | Loss: 0.3378 | LM: 0.2874
249
+ [2026-04-07 17:51:12] Validation | Batch 480/732 | Loss: 0.3379 | LM: 0.2875
250
+ [2026-04-07 17:51:13] Validation | Batch 490/732 | Loss: 0.3389 | LM: 0.2886
251
+ [2026-04-07 17:51:14] Validation | Batch 500/732 | Loss: 0.3400 | LM: 0.2896
252
+ [2026-04-07 17:51:15] Validation | Batch 510/732 | Loss: 0.3397 | LM: 0.2893
253
+ [2026-04-07 17:51:16] Validation | Batch 520/732 | Loss: 0.3395 | LM: 0.2891
254
+ [2026-04-07 17:51:17] Validation | Batch 530/732 | Loss: 0.3389 | LM: 0.2885
255
+ [2026-04-07 17:51:19] Validation | Batch 540/732 | Loss: 0.3390 | LM: 0.2887
256
+ [2026-04-07 17:51:20] Validation | Batch 550/732 | Loss: 0.3390 | LM: 0.2886
257
+ [2026-04-07 17:51:21] Validation | Batch 560/732 | Loss: 0.3385 | LM: 0.2881
258
+ [2026-04-07 17:51:22] Validation | Batch 570/732 | Loss: 0.3386 | LM: 0.2882
259
+ [2026-04-07 17:51:23] Validation | Batch 580/732 | Loss: 0.3383 | LM: 0.2879
260
+ [2026-04-07 17:51:25] Validation | Batch 590/732 | Loss: 0.3383 | LM: 0.2879
261
+ [2026-04-07 17:51:26] Validation | Batch 600/732 | Loss: 0.3382 | LM: 0.2878
262
+ [2026-04-07 17:51:27] Validation | Batch 610/732 | Loss: 0.3388 | LM: 0.2884
263
+ [2026-04-07 17:51:28] Validation | Batch 620/732 | Loss: 0.3391 | LM: 0.2887
264
+ [2026-04-07 17:51:29] Validation | Batch 630/732 | Loss: 0.3389 | LM: 0.2885
265
+ [2026-04-07 17:51:31] Validation | Batch 640/732 | Loss: 0.3386 | LM: 0.2883
266
+ [2026-04-07 17:51:32] Validation | Batch 650/732 | Loss: 0.3385 | LM: 0.2881
267
+ [2026-04-07 17:51:33] Validation | Batch 660/732 | Loss: 0.3390 | LM: 0.2886
268
+ [2026-04-07 17:51:34] Validation | Batch 670/732 | Loss: 0.3396 | LM: 0.2892
269
+ [2026-04-07 17:51:35] Validation | Batch 680/732 | Loss: 0.3395 | LM: 0.2891
270
+ [2026-04-07 17:51:36] Validation | Batch 690/732 | Loss: 0.3397 | LM: 0.2893
271
+ [2026-04-07 17:51:37] Validation | Batch 700/732 | Loss: 0.3402 | LM: 0.2898
272
+ [2026-04-07 17:51:39] Validation | Batch 710/732 | Loss: 0.3405 | LM: 0.2901
273
+ [2026-04-07 17:51:40] Validation | Batch 720/732 | Loss: 0.3415 | LM: 0.2911
274
+ [2026-04-07 17:51:41] Validation | Batch 730/732 | Loss: 0.3412 | LM: 0.2908
275
+ [2026-04-07 17:51:41] Validation | Batch 732/732 | Loss: 0.3410 | LM: 0.2906
276
+ [2026-04-07 17:51:41] Validation | Loss: 0.3410 | LM: 0.2906 | PPL: 1.34 | Time: 84.74s
277
+ [2026-04-07 17:51:44] New best model saved! Val loss: 0.3410
278
+ [2026-04-07 17:51:54] Epoch 1 | Step 510 | Loss: 0.3557 | LM: 0.3003 | LB: 1.0107 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-05
279
+ [2026-04-07 17:52:03] Epoch 1 | Step 520 | Loss: 0.3554 | LM: 0.2993 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
280
+ [2026-04-07 17:52:13] Epoch 1 | Step 530 | Loss: 0.3550 | LM: 0.2984 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
281
+ [2026-04-07 17:52:22] Epoch 1 | Step 540 | Loss: 0.3547 | LM: 0.2982 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
282
+ [2026-04-07 17:52:31] Epoch 1 | Step 550 | Loss: 0.3544 | LM: 0.2987 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
283
+ [2026-04-07 17:52:40] Epoch 1 | Step 560 | Loss: 0.3546 | LM: 0.2995 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
284
+ [2026-04-07 17:52:49] Epoch 1 | Step 570 | Loss: 0.3547 | LM: 0.3000 | LB: 1.0106 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.460/SR1: 0.438 | LR: 1.00e-05
285
+ [2026-04-07 17:52:58] Epoch 1 | Step 580 | Loss: 0.3543 | LM: 0.2998 | LB: 1.0105 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.459/SR1: 0.438 | LR: 1.00e-05
286
+ [2026-04-07 17:53:07] Epoch 1 | Step 590 | Loss: 0.3548 | LM: 0.3011 | LB: 1.0105 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
287
+ [2026-04-07 17:53:17] Epoch 1 | Step 600 | Loss: 0.3544 | LM: 0.3003 | LB: 1.0105 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
288
+ [2026-04-07 17:53:26] Epoch 1 | Step 610 | Loss: 0.3539 | LM: 0.3000 | LB: 1.0104 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
289
+ [2026-04-07 17:53:35] Epoch 1 | Step 620 | Loss: 0.3537 | LM: 0.3001 | LB: 1.0104 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
290
+ [2026-04-07 17:53:44] Epoch 1 | Step 630 | Loss: 0.3532 | LM: 0.2999 | LB: 1.0103 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
291
+ [2026-04-07 17:53:54] Epoch 1 | Step 640 | Loss: 0.3529 | LM: 0.2999 | LB: 1.0103 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.459/SR1: 0.437 | LR: 1.00e-05
292
+ [2026-04-07 17:54:03] Epoch 1 | Step 650 | Loss: 0.3531 | LM: 0.2994 | LB: 1.0103 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
293
+ [2026-04-07 17:54:12] Epoch 1 | Step 660 | Loss: 0.3526 | LM: 0.2991 | LB: 1.0102 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
294
+ [2026-04-07 17:54:21] Epoch 1 | Step 670 | Loss: 0.3533 | LM: 0.2996 | LB: 1.0102 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
295
+ [2026-04-07 17:54:31] Epoch 1 | Step 680 | Loss: 0.3530 | LM: 0.2999 | LB: 1.0102 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
296
+ [2026-04-07 17:54:40] Epoch 1 | Step 690 | Loss: 0.3531 | LM: 0.3004 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
297
+ [2026-04-07 17:54:49] Epoch 1 | Step 700 | Loss: 0.3532 | LM: 0.3005 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
298
+ [2026-04-07 17:54:58] Epoch 1 | Step 710 | Loss: 0.3530 | LM: 0.3000 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
299
+ [2026-04-07 17:55:07] Epoch 1 | Step 720 | Loss: 0.3527 | LM: 0.2999 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
300
+ [2026-04-07 17:55:16] Epoch 1 | Step 730 | Loss: 0.3523 | LM: 0.3005 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
301
+ [2026-04-07 17:55:26] Epoch 1 | Step 740 | Loss: 0.3521 | LM: 0.3007 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
302
+ [2026-04-07 17:55:35] Epoch 1 | Step 750 | Loss: 0.3519 | LM: 0.3003 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
303
+ [2026-04-07 17:55:36] Validation | Batch 10/732 | Loss: 0.3236 | LM: 0.2731
304
+ [2026-04-07 17:55:37] Validation | Batch 20/732 | Loss: 0.3450 | LM: 0.2946
305
+ [2026-04-07 17:55:38] Validation | Batch 30/732 | Loss: 0.3364 | LM: 0.2860
306
+ [2026-04-07 17:55:40] Validation | Batch 40/732 | Loss: 0.3408 | LM: 0.2904
307
+ [2026-04-07 17:55:41] Validation | Batch 50/732 | Loss: 0.3406 | LM: 0.2902
308
+ [2026-04-07 17:55:42] Validation | Batch 60/732 | Loss: 0.3426 | LM: 0.2923
309
+ [2026-04-07 17:55:43] Validation | Batch 70/732 | Loss: 0.3462 | LM: 0.2958
310
+ [2026-04-07 17:55:44] Validation | Batch 80/732 | Loss: 0.3447 | LM: 0.2943
311
+ [2026-04-07 17:55:46] Validation | Batch 90/732 | Loss: 0.3444 | LM: 0.2940
312
+ [2026-04-07 17:55:47] Validation | Batch 100/732 | Loss: 0.3455 | LM: 0.2951
313
+ [2026-04-07 17:55:48] Validation | Batch 110/732 | Loss: 0.3423 | LM: 0.2919
314
+ [2026-04-07 17:55:49] Validation | Batch 120/732 | Loss: 0.3455 | LM: 0.2951
315
+ [2026-04-07 17:55:50] Validation | Batch 130/732 | Loss: 0.3468 | LM: 0.2964
316
+ [2026-04-07 17:55:51] Validation | Batch 140/732 | Loss: 0.3463 | LM: 0.2959
317
+ [2026-04-07 17:55:52] Validation | Batch 150/732 | Loss: 0.3456 | LM: 0.2952
318
+ [2026-04-07 17:55:53] Validation | Batch 160/732 | Loss: 0.3447 | LM: 0.2944
319
+ [2026-04-07 17:55:54] Validation | Batch 170/732 | Loss: 0.3453 | LM: 0.2949
320
+ [2026-04-07 17:55:56] Validation | Batch 180/732 | Loss: 0.3465 | LM: 0.2961
321
+ [2026-04-07 17:55:57] Validation | Batch 190/732 | Loss: 0.3459 | LM: 0.2955
322
+ [2026-04-07 17:55:58] Validation | Batch 200/732 | Loss: 0.3459 | LM: 0.2956
323
+ [2026-04-07 17:55:59] Validation | Batch 210/732 | Loss: 0.3452 | LM: 0.2948
324
+ [2026-04-07 17:56:00] Validation | Batch 220/732 | Loss: 0.3447 | LM: 0.2943
325
+ [2026-04-07 17:56:02] Validation | Batch 230/732 | Loss: 0.3451 | LM: 0.2947
326
+ [2026-04-07 17:56:03] Validation | Batch 240/732 | Loss: 0.3448 | LM: 0.2944
327
+ [2026-04-07 17:56:04] Validation | Batch 250/732 | Loss: 0.3448 | LM: 0.2944
328
+ [2026-04-07 17:56:05] Validation | Batch 260/732 | Loss: 0.3438 | LM: 0.2934
329
+ [2026-04-07 17:56:06] Validation | Batch 270/732 | Loss: 0.3436 | LM: 0.2932
330
+ [2026-04-07 17:56:07] Validation | Batch 280/732 | Loss: 0.3425 | LM: 0.2921
331
+ [2026-04-07 17:56:09] Validation | Batch 290/732 | Loss: 0.3422 | LM: 0.2918
332
+ [2026-04-07 17:56:10] Validation | Batch 300/732 | Loss: 0.3422 | LM: 0.2918
333
+ [2026-04-07 17:56:11] Validation | Batch 310/732 | Loss: 0.3420 | LM: 0.2916
334
+ [2026-04-07 17:56:12] Validation | Batch 320/732 | Loss: 0.3411 | LM: 0.2908
335
+ [2026-04-07 17:56:13] Validation | Batch 330/732 | Loss: 0.3400 | LM: 0.2896
336
+ [2026-04-07 17:56:14] Validation | Batch 340/732 | Loss: 0.3394 | LM: 0.2891
337
+ [2026-04-07 17:56:15] Validation | Batch 350/732 | Loss: 0.3398 | LM: 0.2894
338
+ [2026-04-07 17:56:17] Validation | Batch 360/732 | Loss: 0.3406 | LM: 0.2902
339
+ [2026-04-07 17:56:17] Validation | Batch 370/732 | Loss: 0.3397 | LM: 0.2893
340
+ [2026-04-07 17:56:19] Validation | Batch 380/732 | Loss: 0.3390 | LM: 0.2886
341
+ [2026-04-07 17:56:19] Validation | Batch 390/732 | Loss: 0.3386 | LM: 0.2882
342
+ [2026-04-07 17:56:20] Validation | Batch 400/732 | Loss: 0.3384 | LM: 0.2881
343
+ [2026-04-07 17:56:22] Validation | Batch 410/732 | Loss: 0.3377 | LM: 0.2873
344
+ [2026-04-07 17:56:23] Validation | Batch 420/732 | Loss: 0.3379 | LM: 0.2876
345
+ [2026-04-07 17:56:24] Validation | Batch 430/732 | Loss: 0.3379 | LM: 0.2875
346
+ [2026-04-07 17:56:25] Validation | Batch 440/732 | Loss: 0.3374 | LM: 0.2870
347
+ [2026-04-07 17:56:26] Validation | Batch 450/732 | Loss: 0.3372 | LM: 0.2868
348
+ [2026-04-07 17:56:28] Validation | Batch 460/732 | Loss: 0.3376 | LM: 0.2872
349
+ [2026-04-07 17:56:29] Validation | Batch 470/732 | Loss: 0.3373 | LM: 0.2869
350
+ [2026-04-07 17:56:30] Validation | Batch 480/732 | Loss: 0.3375 | LM: 0.2871
351
+ [2026-04-07 17:56:31] Validation | Batch 490/732 | Loss: 0.3385 | LM: 0.2881
352
+ [2026-04-07 17:56:32] Validation | Batch 500/732 | Loss: 0.3396 | LM: 0.2892
353
+ [2026-04-07 17:56:33] Validation | Batch 510/732 | Loss: 0.3392 | LM: 0.2888
354
+ [2026-04-07 17:56:35] Validation | Batch 520/732 | Loss: 0.3390 | LM: 0.2886
355
+ [2026-04-07 17:56:36] Validation | Batch 530/732 | Loss: 0.3384 | LM: 0.2880
356
+ [2026-04-07 17:56:37] Validation | Batch 540/732 | Loss: 0.3386 | LM: 0.2882
357
+ [2026-04-07 17:56:38] Validation | Batch 550/732 | Loss: 0.3385 | LM: 0.2881
358
+ [2026-04-07 17:56:39] Validation | Batch 560/732 | Loss: 0.3380 | LM: 0.2877
359
+ [2026-04-07 17:56:40] Validation | Batch 570/732 | Loss: 0.3381 | LM: 0.2877
360
+ [2026-04-07 17:56:41] Validation | Batch 580/732 | Loss: 0.3378 | LM: 0.2874
361
+ [2026-04-07 17:56:43] Validation | Batch 590/732 | Loss: 0.3379 | LM: 0.2875
362
+ [2026-04-07 17:56:44] Validation | Batch 600/732 | Loss: 0.3378 | LM: 0.2874
363
+ [2026-04-07 17:56:45] Validation | Batch 610/732 | Loss: 0.3383 | LM: 0.2879
364
+ [2026-04-07 17:56:46] Validation | Batch 620/732 | Loss: 0.3387 | LM: 0.2883
365
+ [2026-04-07 17:56:48] Validation | Batch 630/732 | Loss: 0.3385 | LM: 0.2881
366
+ [2026-04-07 17:56:49] Validation | Batch 640/732 | Loss: 0.3382 | LM: 0.2878
367
+ [2026-04-07 17:56:50] Validation | Batch 650/732 | Loss: 0.3380 | LM: 0.2876
368
+ [2026-04-07 17:56:51] Validation | Batch 660/732 | Loss: 0.3385 | LM: 0.2881
369
+ [2026-04-07 17:56:52] Validation | Batch 670/732 | Loss: 0.3391 | LM: 0.2887
370
+ [2026-04-07 17:56:53] Validation | Batch 680/732 | Loss: 0.3391 | LM: 0.2887
371
+ [2026-04-07 17:56:54] Validation | Batch 690/732 | Loss: 0.3393 | LM: 0.2889
372
+ [2026-04-07 17:56:56] Validation | Batch 700/732 | Loss: 0.3397 | LM: 0.2893
373
+ [2026-04-07 17:56:57] Validation | Batch 710/732 | Loss: 0.3401 | LM: 0.2897
374
+ [2026-04-07 17:56:58] Validation | Batch 720/732 | Loss: 0.3411 | LM: 0.2907
375
+ [2026-04-07 17:56:59] Validation | Batch 730/732 | Loss: 0.3407 | LM: 0.2904
376
+ [2026-04-07 17:56:59] Validation | Batch 732/732 | Loss: 0.3406 | LM: 0.2902
377
+ [2026-04-07 17:56:59] Validation | Loss: 0.3406 | LM: 0.2902 | PPL: 1.34 | Time: 84.69s
378
+ [2026-04-07 17:57:02] New best model saved! Val loss: 0.3406
379
+ [2026-04-07 17:57:12] Epoch 1 | Step 760 | Loss: 0.3517 | LM: 0.2994 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
380
+ [2026-04-07 17:57:22] Epoch 1 | Step 770 | Loss: 0.3514 | LM: 0.2994 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
381
+ [2026-04-07 17:57:31] Epoch 1 | Step 780 | Loss: 0.3517 | LM: 0.3008 | LB: 1.0101 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.358 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-05
382
+ [2026-04-07 17:57:32] Reached max_steps=781, stopping training.
383
+ [2026-04-07 17:57:32] Epoch 1 completed in 1006.96s | Loss: 0.3516 | CL0: 2.8 | CL1: 2.2
384
+ [2026-04-07 17:57:32]
385
+ Training completed!
386
+ [2026-04-07 17:57:34] Final model: outputs/N_2.5/model_final.pt
routing_tuning_test_07_04/N_4.0/.hydra/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 1
6
+ max_steps: null
7
+ batch_size: 8
8
+ eval_batch_size: 24
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ lr_multiplier:
22
+ - 2.0
23
+ - 1.5
24
+ - 1.0
25
+ load_balancing_weight: 0.05
26
+ load_balancing_N: 4.0
27
+ max_grad_norm: 1.0
28
+ use_amp: true
29
+ resume: false
30
+ resume_checkpoint: null
31
+ warmup_model: true
32
+ data:
33
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
34
+ max_context_len: 4096
35
+ max_target_len: 256
36
+ num_workers: 0
37
+ pin_memory: true
38
+ max_train_samples: 50000
39
+ max_val_samples: null
40
+ logging:
41
+ log_interval: 10
42
+ save_interval: 1000
43
+ eval_interval: 250
44
+ save_every_epoch: false
45
+ model_only_checkpoints: true
46
+ tracking:
47
+ enabled: true
48
+ project: routing-evolution
49
+ run_name: routing_N4.0
50
+ paths:
51
+ output_dir: outputs/N_${training.load_balancing_N}
52
+ seed: 42
53
+ device: cuda
routing_tuning_test_07_04/N_4.0/.hydra/hydra.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - training.load_balancing_N=4.0
116
+ - tracking.run_name=routing_N4.0
117
+ job:
118
+ name: train
119
+ chdir: false
120
+ override_dirname: tracking.run_name=routing_N4.0,training.load_balancing_N=4.0
121
+ id: ???
122
+ num: ???
123
+ config_name: config
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /workspace/byte-llms-code/routing_evolution_exp
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /workspace/byte-llms-code/routing_evolution_exp/configs
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_4.0
146
+ choices:
147
+ paths: default
148
+ tracking: default
149
+ logging: default
150
+ data: default
151
+ training: default
152
+ model: hnet_xl_code
153
+ hydra/env: default
154
+ hydra/callbacks: null
155
+ hydra/job_logging: default
156
+ hydra/hydra_logging: default
157
+ hydra/hydra_help: default
158
+ hydra/help: default
159
+ hydra/sweeper: basic
160
+ hydra/launcher: basic
161
+ hydra/output: default
162
+ verbose: false
routing_tuning_test_07_04/N_4.0/.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - training.load_balancing_N=4.0
2
+ - tracking.run_name=routing_N4.0
routing_tuning_test_07_04/N_4.0/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d4e628f340f7ab262f369d2e6c937b10d82f77aabfd27ee70c40d8fd11b6e6a
3
+ size 3315165139
routing_tuning_test_07_04/N_4.0/model_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30da6a4faa77f36ff078f9eaa61546c90682ec9cc8c595fe235028d1c0794b38
3
+ size 3315165484
routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
3
+ size 13633736
routing_tuning_test_07_04/N_4.0/routing_weights/routing_step_781.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06258594cce8b74a246104ac374f19d8258d68fcd8c776838309bf3024519e51
3
+ size 13633752
routing_tuning_test_07_04/N_4.0/train.log ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-04-07 17:57:52] CUDA_VISIBLE_DEVICES: 0,1
2
+ [2026-04-07 17:57:52] Number of processes: 2
3
+ [2026-04-07 17:57:52] Mixed precision: bf16
4
+ [2026-04-07 17:57:52] ============================================================
5
+ [2026-04-07 17:57:52] Routing Evolution Experiment | N=4.0
6
+ [2026-04-07 17:57:52] ============================================================
7
+ [2026-04-07 17:57:52] Config:
8
+ model:
9
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
10
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
11
+ training:
12
+ epochs: 1
13
+ max_steps: null
14
+ batch_size: 8
15
+ eval_batch_size: 24
16
+ gradient_accumulation_steps: 4
17
+ lr: 0.0001
18
+ weight_decay: 0.1
19
+ betas:
20
+ - 0.9
21
+ - 0.95
22
+ eps: 1.0e-08
23
+ lr_scheduler: wsd
24
+ warmup_ratio: 0.1
25
+ decay_ratio: 0.2
26
+ warmup_steps: 100
27
+ min_lr_ratio: 0.1
28
+ lr_multiplier:
29
+ - 2.0
30
+ - 1.5
31
+ - 1.0
32
+ load_balancing_weight: 0.05
33
+ load_balancing_N: 4.0
34
+ max_grad_norm: 1.0
35
+ use_amp: true
36
+ resume: false
37
+ resume_checkpoint: null
38
+ warmup_model: true
39
+ data:
40
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
41
+ max_context_len: 4096
42
+ max_target_len: 256
43
+ num_workers: 0
44
+ pin_memory: true
45
+ max_train_samples: 50000
46
+ max_val_samples: null
47
+ logging:
48
+ log_interval: 10
49
+ save_interval: 1000
50
+ eval_interval: 250
51
+ save_every_epoch: false
52
+ model_only_checkpoints: true
53
+ tracking:
54
+ enabled: true
55
+ project: routing-evolution
56
+ run_name: routing_N4.0
57
+ paths:
58
+ output_dir: outputs/N_4.0
59
+ seed: 42
60
+ device: cuda
61
+
62
+ [2026-04-07 17:57:53] Loading model...
63
+ [2026-04-07 17:57:59] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
64
+ [2026-04-07 17:57:59] Applied LR multipliers: [2.0, 1.5, 1.0]
65
+ [2026-04-07 17:57:59] Warming up model...
66
+ [2026-04-07 17:58:45] Total params: 1,654,090,112
67
+ [2026-04-07 17:58:45] Trainable params: 1,654,090,112
68
+ [2026-04-07 17:58:45] Creating dataloaders...
69
+ [2026-04-07 17:58:45] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
70
+ [2026-04-07 17:58:45] Max steps: 781, Steps per epoch: 3125
71
+ [2026-04-07 17:58:47] Starting training...
72
+ [2026-04-07 17:58:47]
73
+ ============================================================
74
+ [2026-04-07 17:58:47] EPOCH 1/1 (step 0)
75
+ [2026-04-07 17:58:47] ============================================================
76
+ [2026-04-07 17:59:19] Epoch 1 | Step 10 | Loss: 0.7035 | LM: 0.6494 | LB: 1.1662 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
77
+ [2026-04-07 17:59:28] Epoch 1 | Step 20 | Loss: 0.6032 | LM: 0.5509 | LB: 1.1644 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.459 | LR: 5.62e-05
78
+ [2026-04-07 17:59:38] Epoch 1 | Step 30 | Loss: 0.5447 | LM: 0.4787 | LB: 1.1629 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.362 | HR1: 0.479/SR1: 0.457 | LR: 7.92e-05
79
+ [2026-04-07 17:59:47] Epoch 1 | Step 40 | Loss: 0.5087 | LM: 0.4478 | LB: 1.1577 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.476/SR1: 0.456 | LR: 1.00e-04
80
+ [2026-04-07 17:59:56] Epoch 1 | Step 50 | Loss: 0.4861 | LM: 0.4097 | LB: 1.1533 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.472/SR1: 0.452 | LR: 1.00e-04
81
+ [2026-04-07 18:00:05] Epoch 1 | Step 60 | Loss: 0.4599 | LM: 0.3813 | LB: 1.1530 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.472/SR1: 0.452 | LR: 1.00e-04
82
+ [2026-04-07 18:00:14] Epoch 1 | Step 70 | Loss: 0.4455 | LM: 0.3774 | LB: 1.1525 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.358 | HR1: 0.471/SR1: 0.451 | LR: 1.00e-04
83
+ [2026-04-07 18:00:23] Epoch 1 | Step 80 | Loss: 0.4307 | LM: 0.3666 | LB: 1.1508 | CL0: 2.8 | CL1: 2.1 | HR0: 0.360/SR0: 0.358 | HR1: 0.469/SR1: 0.449 | LR: 1.00e-04
84
+ [2026-04-07 18:00:32] Epoch 1 | Step 90 | Loss: 0.4221 | LM: 0.3536 | LB: 1.1481 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.359 | HR1: 0.466/SR1: 0.446 | LR: 1.00e-04
85
+ [2026-04-07 18:00:41] Epoch 1 | Step 100 | Loss: 0.4171 | LM: 0.3510 | LB: 1.1457 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.466/SR1: 0.444 | LR: 1.00e-04
86
+ [2026-04-07 18:00:50] Epoch 1 | Step 110 | Loss: 0.4127 | LM: 0.3458 | LB: 1.1432 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.357 | HR1: 0.463/SR1: 0.442 | LR: 1.00e-04
87
+ [2026-04-07 18:00:59] Epoch 1 | Step 120 | Loss: 0.4102 | LM: 0.3473 | LB: 1.1407 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.461/SR1: 0.440 | LR: 1.00e-04
88
+ [2026-04-07 18:01:08] Epoch 1 | Step 130 | Loss: 0.4062 | LM: 0.3443 | LB: 1.1390 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.460/SR1: 0.439 | LR: 1.00e-04
89
+ [2026-04-07 18:01:17] Epoch 1 | Step 140 | Loss: 0.4019 | LM: 0.3399 | LB: 1.1363 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.458/SR1: 0.436 | LR: 1.00e-04
90
+ [2026-04-07 18:01:26] Epoch 1 | Step 150 | Loss: 0.3995 | LM: 0.3375 | LB: 1.1345 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.456/SR1: 0.435 | LR: 1.00e-04
91
+ [2026-04-07 18:01:35] Epoch 1 | Step 160 | Loss: 0.3952 | LM: 0.3321 | LB: 1.1326 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.354 | HR1: 0.455/SR1: 0.433 | LR: 1.00e-04
92
+ [2026-04-07 18:01:44] Epoch 1 | Step 170 | Loss: 0.3904 | LM: 0.3274 | LB: 1.1309 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.453/SR1: 0.431 | LR: 1.00e-04
93
+ [2026-04-07 18:01:53] Epoch 1 | Step 180 | Loss: 0.3868 | LM: 0.3233 | LB: 1.1297 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.452/SR1: 0.430 | LR: 1.00e-04
94
+ [2026-04-07 18:02:03] Epoch 1 | Step 190 | Loss: 0.3850 | LM: 0.3229 | LB: 1.1283 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.355 | HR1: 0.450/SR1: 0.428 | LR: 1.00e-04
95
+ [2026-04-07 18:02:13] Epoch 1 | Step 200 | Loss: 0.3824 | LM: 0.3201 | LB: 1.1267 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.449/SR1: 0.426 | LR: 1.00e-04
96
+ [2026-04-07 18:02:23] Epoch 1 | Step 210 | Loss: 0.3822 | LM: 0.3205 | LB: 1.1251 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.354 | HR1: 0.447/SR1: 0.425 | LR: 1.00e-04
97
+ [2026-04-07 18:02:33] Epoch 1 | Step 220 | Loss: 0.3813 | LM: 0.3185 | LB: 1.1242 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.354 | HR1: 0.447/SR1: 0.424 | LR: 1.00e-04
98
+ [2026-04-07 18:02:42] Epoch 1 | Step 230 | Loss: 0.3799 | LM: 0.3177 | LB: 1.1229 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.355 | HR1: 0.445/SR1: 0.422 | LR: 1.00e-04
99
+ [2026-04-07 18:02:51] Epoch 1 | Step 240 | Loss: 0.3778 | LM: 0.3188 | LB: 1.1218 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.444/SR1: 0.421 | LR: 1.00e-04
100
+ [2026-04-07 18:03:00] Epoch 1 | Step 250 | Loss: 0.3754 | LM: 0.3170 | LB: 1.1205 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.443/SR1: 0.420 | LR: 1.00e-04
101
+ [2026-04-07 18:03:01] Validation | Batch 10/732 | Loss: 0.3314 | LM: 0.2769
102
+ [2026-04-07 18:03:03] Validation | Batch 20/732 | Loss: 0.3530 | LM: 0.2984
103
+ [2026-04-07 18:03:04] Validation | Batch 30/732 | Loss: 0.3439 | LM: 0.2894
104
+ [2026-04-07 18:03:05] Validation | Batch 40/732 | Loss: 0.3490 | LM: 0.2945
105
+ [2026-04-07 18:03:06] Validation | Batch 50/732 | Loss: 0.3493 | LM: 0.2948
106
+ [2026-04-07 18:03:07] Validation | Batch 60/732 | Loss: 0.3518 | LM: 0.2973
107
+ [2026-04-07 18:03:08] Validation | Batch 70/732 | Loss: 0.3553 | LM: 0.3008
108
+ [2026-04-07 18:03:10] Validation | Batch 80/732 | Loss: 0.3535 | LM: 0.2991
109
+ [2026-04-07 18:03:11] Validation | Batch 90/732 | Loss: 0.3530 | LM: 0.2986
110
+ [2026-04-07 18:03:12] Validation | Batch 100/732 | Loss: 0.3540 | LM: 0.2995
111
+ [2026-04-07 18:03:13] Validation | Batch 110/732 | Loss: 0.3508 | LM: 0.2963
112
+ [2026-04-07 18:03:14] Validation | Batch 120/732 | Loss: 0.3540 | LM: 0.2996
113
+ [2026-04-07 18:03:15] Validation | Batch 130/732 | Loss: 0.3554 | LM: 0.3010
114
+ [2026-04-07 18:03:17] Validation | Batch 140/732 | Loss: 0.3548 | LM: 0.3004
115
+ [2026-04-07 18:03:18] Validation | Batch 150/732 | Loss: 0.3541 | LM: 0.2997
116
+ [2026-04-07 18:03:19] Validation | Batch 160/732 | Loss: 0.3532 | LM: 0.2988
117
+ [2026-04-07 18:03:20] Validation | Batch 170/732 | Loss: 0.3536 | LM: 0.2992
118
+ [2026-04-07 18:03:21] Validation | Batch 180/732 | Loss: 0.3549 | LM: 0.3004
119
+ [2026-04-07 18:03:22] Validation | Batch 190/732 | Loss: 0.3542 | LM: 0.2998
120
+ [2026-04-07 18:03:23] Validation | Batch 200/732 | Loss: 0.3543 | LM: 0.2998
121
+ [2026-04-07 18:03:24] Validation | Batch 210/732 | Loss: 0.3535 | LM: 0.2991
122
+ [2026-04-07 18:03:25] Validation | Batch 220/732 | Loss: 0.3530 | LM: 0.2986
123
+ [2026-04-07 18:03:27] Validation | Batch 230/732 | Loss: 0.3534 | LM: 0.2990
124
+ [2026-04-07 18:03:28] Validation | Batch 240/732 | Loss: 0.3531 | LM: 0.2987
125
+ [2026-04-07 18:03:29] Validation | Batch 250/732 | Loss: 0.3532 | LM: 0.2987
126
+ [2026-04-07 18:03:30] Validation | Batch 260/732 | Loss: 0.3522 | LM: 0.2977
127
+ [2026-04-07 18:03:31] Validation | Batch 270/732 | Loss: 0.3519 | LM: 0.2975
128
+ [2026-04-07 18:03:32] Validation | Batch 280/732 | Loss: 0.3508 | LM: 0.2963
129
+ [2026-04-07 18:03:33] Validation | Batch 290/732 | Loss: 0.3506 | LM: 0.2962
130
+ [2026-04-07 18:03:34] Validation | Batch 300/732 | Loss: 0.3505 | LM: 0.2960
131
+ [2026-04-07 18:03:36] Validation | Batch 310/732 | Loss: 0.3504 | LM: 0.2960
132
+ [2026-04-07 18:03:37] Validation | Batch 320/732 | Loss: 0.3495 | LM: 0.2950
133
+ [2026-04-07 18:03:38] Validation | Batch 330/732 | Loss: 0.3484 | LM: 0.2939
134
+ [2026-04-07 18:03:39] Validation | Batch 340/732 | Loss: 0.3478 | LM: 0.2933
135
+ [2026-04-07 18:03:40] Validation | Batch 350/732 | Loss: 0.3481 | LM: 0.2936
136
+ [2026-04-07 18:03:41] Validation | Batch 360/732 | Loss: 0.3489 | LM: 0.2944
137
+ [2026-04-07 18:03:42] Validation | Batch 370/732 | Loss: 0.3479 | LM: 0.2935
138
+ [2026-04-07 18:03:43] Validation | Batch 380/732 | Loss: 0.3473 | LM: 0.2928
139
+ [2026-04-07 18:03:44] Validation | Batch 390/732 | Loss: 0.3469 | LM: 0.2924
140
+ [2026-04-07 18:03:45] Validation | Batch 400/732 | Loss: 0.3468 | LM: 0.2923
141
+ [2026-04-07 18:03:46] Validation | Batch 410/732 | Loss: 0.3460 | LM: 0.2915
142
+ [2026-04-07 18:03:47] Validation | Batch 420/732 | Loss: 0.3462 | LM: 0.2917
143
+ [2026-04-07 18:03:48] Validation | Batch 430/732 | Loss: 0.3461 | LM: 0.2916
144
+ [2026-04-07 18:03:50] Validation | Batch 440/732 | Loss: 0.3456 | LM: 0.2911
145
+ [2026-04-07 18:03:51] Validation | Batch 450/732 | Loss: 0.3454 | LM: 0.2909
146
+ [2026-04-07 18:03:52] Validation | Batch 460/732 | Loss: 0.3458 | LM: 0.2913
147
+ [2026-04-07 18:03:53] Validation | Batch 470/732 | Loss: 0.3456 | LM: 0.2911
148
+ [2026-04-07 18:03:54] Validation | Batch 480/732 | Loss: 0.3457 | LM: 0.2913
149
+ [2026-04-07 18:03:55] Validation | Batch 490/732 | Loss: 0.3468 | LM: 0.2923
150
+ [2026-04-07 18:03:56] Validation | Batch 500/732 | Loss: 0.3479 | LM: 0.2934
151
+ [2026-04-07 18:03:57] Validation | Batch 510/732 | Loss: 0.3475 | LM: 0.2931
152
+ [2026-04-07 18:03:58] Validation | Batch 520/732 | Loss: 0.3473 | LM: 0.2928
153
+ [2026-04-07 18:04:00] Validation | Batch 530/732 | Loss: 0.3467 | LM: 0.2922
154
+ [2026-04-07 18:04:01] Validation | Batch 540/732 | Loss: 0.3468 | LM: 0.2924
155
+ [2026-04-07 18:04:02] Validation | Batch 550/732 | Loss: 0.3468 | LM: 0.2923
156
+ [2026-04-07 18:04:03] Validation | Batch 560/732 | Loss: 0.3463 | LM: 0.2919
157
+ [2026-04-07 18:04:04] Validation | Batch 570/732 | Loss: 0.3464 | LM: 0.2919
158
+ [2026-04-07 18:04:05] Validation | Batch 580/732 | Loss: 0.3461 | LM: 0.2917
159
+ [2026-04-07 18:04:07] Validation | Batch 590/732 | Loss: 0.3461 | LM: 0.2916
160
+ [2026-04-07 18:04:08] Validation | Batch 600/732 | Loss: 0.3461 | LM: 0.2916
161
+ [2026-04-07 18:04:09] Validation | Batch 610/732 | Loss: 0.3466 | LM: 0.2922
162
+ [2026-04-07 18:04:10] Validation | Batch 620/732 | Loss: 0.3470 | LM: 0.2925
163
+ [2026-04-07 18:04:11] Validation | Batch 630/732 | Loss: 0.3468 | LM: 0.2923
164
+ [2026-04-07 18:04:12] Validation | Batch 640/732 | Loss: 0.3466 | LM: 0.2921
165
+ [2026-04-07 18:04:14] Validation | Batch 650/732 | Loss: 0.3464 | LM: 0.2919
166
+ [2026-04-07 18:04:15] Validation | Batch 660/732 | Loss: 0.3469 | LM: 0.2924
167
+ [2026-04-07 18:04:16] Validation | Batch 670/732 | Loss: 0.3475 | LM: 0.2930
168
+ [2026-04-07 18:04:17] Validation | Batch 680/732 | Loss: 0.3474 | LM: 0.2930
169
+ [2026-04-07 18:04:18] Validation | Batch 690/732 | Loss: 0.3476 | LM: 0.2932
170
+ [2026-04-07 18:04:19] Validation | Batch 700/732 | Loss: 0.3481 | LM: 0.2936
171
+ [2026-04-07 18:04:20] Validation | Batch 710/732 | Loss: 0.3485 | LM: 0.2940
172
+ [2026-04-07 18:04:21] Validation | Batch 720/732 | Loss: 0.3495 | LM: 0.2950
173
+ [2026-04-07 18:04:22] Validation | Batch 730/732 | Loss: 0.3492 | LM: 0.2947
174
+ [2026-04-07 18:04:23] Validation | Batch 732/732 | Loss: 0.3490 | LM: 0.2945
175
+ [2026-04-07 18:04:23] Validation | Loss: 0.3490 | LM: 0.2945 | PPL: 1.34 | Time: 82.14s
176
+ [2026-04-07 18:04:25] New best model saved! Val loss: 0.3490
177
+ [2026-04-07 18:04:35] Epoch 1 | Step 260 | Loss: 0.3735 | LM: 0.3160 | LB: 1.1194 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.442/SR1: 0.419 | LR: 1.00e-04
178
+ [2026-04-07 18:04:44] Epoch 1 | Step 270 | Loss: 0.3735 | LM: 0.3143 | LB: 1.1184 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.441/SR1: 0.418 | LR: 1.00e-04
179
+ [2026-04-07 18:04:53] Epoch 1 | Step 280 | Loss: 0.3733 | LM: 0.3145 | LB: 1.1175 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.440/SR1: 0.417 | LR: 1.00e-04
180
+ [2026-04-07 18:05:02] Epoch 1 | Step 290 | Loss: 0.3716 | LM: 0.3125 | LB: 1.1164 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.439/SR1: 0.415 | LR: 1.00e-04
181
+ [2026-04-07 18:05:11] Epoch 1 | Step 300 | Loss: 0.3708 | LM: 0.3120 | LB: 1.1156 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.438/SR1: 0.414 | LR: 1.00e-04
182
+ [2026-04-07 18:05:21] Epoch 1 | Step 310 | Loss: 0.3699 | LM: 0.3110 | LB: 1.1149 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.355 | HR1: 0.437/SR1: 0.413 | LR: 1.00e-04
183
+ [2026-04-07 18:05:30] Epoch 1 | Step 320 | Loss: 0.3692 | LM: 0.3105 | LB: 1.1140 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.436/SR1: 0.412 | LR: 9.80e-05
184
+ [2026-04-07 18:05:39] Epoch 1 | Step 330 | Loss: 0.3680 | LM: 0.3096 | LB: 1.1130 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.435/SR1: 0.411 | LR: 8.93e-05
185
+ [2026-04-07 18:05:48] Epoch 1 | Step 340 | Loss: 0.3671 | LM: 0.3088 | LB: 1.1121 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.434/SR1: 0.410 | LR: 7.51e-05
186
+ [2026-04-07 18:05:57] Epoch 1 | Step 350 | Loss: 0.3666 | LM: 0.3093 | LB: 1.1113 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.433/SR1: 0.409 | LR: 5.77e-05
187
+ [2026-04-07 18:06:06] Epoch 1 | Step 360 | Loss: 0.3659 | LM: 0.3108 | LB: 1.1107 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.433/SR1: 0.408 | LR: 3.99e-05
188
+ [2026-04-07 18:06:15] Epoch 1 | Step 370 | Loss: 0.3655 | LM: 0.3104 | LB: 1.1102 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.432/SR1: 0.408 | LR: 2.45e-05
189
+ [2026-04-07 18:06:24] Epoch 1 | Step 380 | Loss: 0.3644 | LM: 0.3085 | LB: 1.1095 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.432/SR1: 0.407 | LR: 1.40e-05
190
+ [2026-04-07 18:06:33] Epoch 1 | Step 390 | Loss: 0.3646 | LM: 0.3082 | LB: 1.1088 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.354 | HR1: 0.431/SR1: 0.406 | LR: 1.00e-05
191
+ [2026-04-07 18:06:42] Epoch 1 | Step 400 | Loss: 0.3644 | LM: 0.3090 | LB: 1.1083 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.430/SR1: 0.406 | LR: 1.00e-05
192
+ [2026-04-07 18:06:51] Epoch 1 | Step 410 | Loss: 0.3643 | LM: 0.3088 | LB: 1.1077 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.430/SR1: 0.405 | LR: 1.00e-05
193
+ [2026-04-07 18:07:00] Epoch 1 | Step 420 | Loss: 0.3644 | LM: 0.3082 | LB: 1.1072 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.429/SR1: 0.405 | LR: 1.00e-05
194
+ [2026-04-07 18:07:09] Epoch 1 | Step 430 | Loss: 0.3638 | LM: 0.3066 | LB: 1.1067 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.429/SR1: 0.404 | LR: 1.00e-05
195
+ [2026-04-07 18:07:18] Epoch 1 | Step 440 | Loss: 0.3630 | LM: 0.3057 | LB: 1.1064 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.429/SR1: 0.404 | LR: 1.00e-05
196
+ [2026-04-07 18:07:28] Epoch 1 | Step 450 | Loss: 0.3624 | LM: 0.3044 | LB: 1.1058 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.428/SR1: 0.403 | LR: 1.00e-05
197
+ [2026-04-07 18:07:37] Epoch 1 | Step 460 | Loss: 0.3625 | LM: 0.3035 | LB: 1.1053 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.428/SR1: 0.402 | LR: 1.00e-05
198
+ [2026-04-07 18:07:45] Epoch 1 | Step 470 | Loss: 0.3623 | LM: 0.3043 | LB: 1.1050 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.427/SR1: 0.402 | LR: 1.00e-05
199
+ [2026-04-07 18:07:54] Epoch 1 | Step 480 | Loss: 0.3619 | LM: 0.3036 | LB: 1.1045 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.427/SR1: 0.401 | LR: 1.00e-05
200
+ [2026-04-07 18:08:04] Epoch 1 | Step 490 | Loss: 0.3612 | LM: 0.3024 | LB: 1.1041 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.426/SR1: 0.401 | LR: 1.00e-05
201
+ [2026-04-07 18:08:13] Epoch 1 | Step 500 | Loss: 0.3614 | LM: 0.3029 | LB: 1.1037 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.426/SR1: 0.400 | LR: 1.00e-05
202
+ [2026-04-07 18:08:13] Validation | Batch 10/732 | Loss: 0.3283 | LM: 0.2742
203
+ [2026-04-07 18:08:15] Validation | Batch 20/732 | Loss: 0.3500 | LM: 0.2958
204
+ [2026-04-07 18:08:16] Validation | Batch 30/732 | Loss: 0.3412 | LM: 0.2872
205
+ [2026-04-07 18:08:17] Validation | Batch 40/732 | Loss: 0.3462 | LM: 0.2921
206
+ [2026-04-07 18:08:18] Validation | Batch 50/732 | Loss: 0.3460 | LM: 0.2920
207
+ [2026-04-07 18:08:19] Validation | Batch 60/732 | Loss: 0.3480 | LM: 0.2940
208
+ [2026-04-07 18:08:20] Validation | Batch 70/732 | Loss: 0.3514 | LM: 0.2973
209
+ [2026-04-07 18:08:22] Validation | Batch 80/732 | Loss: 0.3497 | LM: 0.2957
210
+ [2026-04-07 18:08:23] Validation | Batch 90/732 | Loss: 0.3495 | LM: 0.2955
211
+ [2026-04-07 18:08:24] Validation | Batch 100/732 | Loss: 0.3505 | LM: 0.2965
212
+ [2026-04-07 18:08:25] Validation | Batch 110/732 | Loss: 0.3472 | LM: 0.2932
213
+ [2026-04-07 18:08:26] Validation | Batch 120/732 | Loss: 0.3504 | LM: 0.2965
214
+ [2026-04-07 18:08:27] Validation | Batch 130/732 | Loss: 0.3517 | LM: 0.2977
215
+ [2026-04-07 18:08:29] Validation | Batch 140/732 | Loss: 0.3511 | LM: 0.2971
216
+ [2026-04-07 18:08:30] Validation | Batch 150/732 | Loss: 0.3505 | LM: 0.2965
217
+ [2026-04-07 18:08:31] Validation | Batch 160/732 | Loss: 0.3497 | LM: 0.2957
218
+ [2026-04-07 18:08:32] Validation | Batch 170/732 | Loss: 0.3503 | LM: 0.2963
219
+ [2026-04-07 18:08:33] Validation | Batch 180/732 | Loss: 0.3516 | LM: 0.2976
220
+ [2026-04-07 18:08:34] Validation | Batch 190/732 | Loss: 0.3509 | LM: 0.2970
221
+ [2026-04-07 18:08:35] Validation | Batch 200/732 | Loss: 0.3510 | LM: 0.2970
222
+ [2026-04-07 18:08:36] Validation | Batch 210/732 | Loss: 0.3503 | LM: 0.2964
223
+ [2026-04-07 18:08:37] Validation | Batch 220/732 | Loss: 0.3499 | LM: 0.2959
224
+ [2026-04-07 18:08:38] Validation | Batch 230/732 | Loss: 0.3503 | LM: 0.2963
225
+ [2026-04-07 18:08:40] Validation | Batch 240/732 | Loss: 0.3501 | LM: 0.2961
226
+ [2026-04-07 18:08:41] Validation | Batch 250/732 | Loss: 0.3500 | LM: 0.2960
227
+ [2026-04-07 18:08:42] Validation | Batch 260/732 | Loss: 0.3490 | LM: 0.2950
228
+ [2026-04-07 18:08:43] Validation | Batch 270/732 | Loss: 0.3488 | LM: 0.2948
229
+ [2026-04-07 18:08:44] Validation | Batch 280/732 | Loss: 0.3478 | LM: 0.2938
230
+ [2026-04-07 18:08:45] Validation | Batch 290/732 | Loss: 0.3476 | LM: 0.2935
231
+ [2026-04-07 18:08:46] Validation | Batch 300/732 | Loss: 0.3475 | LM: 0.2935
232
+ [2026-04-07 18:08:47] Validation | Batch 310/732 | Loss: 0.3474 | LM: 0.2933
233
+ [2026-04-07 18:08:49] Validation | Batch 320/732 | Loss: 0.3465 | LM: 0.2924
234
+ [2026-04-07 18:08:50] Validation | Batch 330/732 | Loss: 0.3454 | LM: 0.2913
235
+ [2026-04-07 18:08:51] Validation | Batch 340/732 | Loss: 0.3448 | LM: 0.2907
236
+ [2026-04-07 18:08:52] Validation | Batch 350/732 | Loss: 0.3451 | LM: 0.2911
237
+ [2026-04-07 18:08:53] Validation | Batch 360/732 | Loss: 0.3459 | LM: 0.2919
238
+ [2026-04-07 18:08:54] Validation | Batch 370/732 | Loss: 0.3450 | LM: 0.2910
239
+ [2026-04-07 18:08:55] Validation | Batch 380/732 | Loss: 0.3443 | LM: 0.2903
240
+ [2026-04-07 18:08:56] Validation | Batch 390/732 | Loss: 0.3440 | LM: 0.2899
241
+ [2026-04-07 18:08:57] Validation | Batch 400/732 | Loss: 0.3438 | LM: 0.2897
242
+ [2026-04-07 18:08:58] Validation | Batch 410/732 | Loss: 0.3431 | LM: 0.2890
243
+ [2026-04-07 18:08:59] Validation | Batch 420/732 | Loss: 0.3433 | LM: 0.2892
244
+ [2026-04-07 18:09:00] Validation | Batch 430/732 | Loss: 0.3432 | LM: 0.2892
245
+ [2026-04-07 18:09:01] Validation | Batch 440/732 | Loss: 0.3427 | LM: 0.2887
246
+ [2026-04-07 18:09:02] Validation | Batch 450/732 | Loss: 0.3426 | LM: 0.2885
247
+ [2026-04-07 18:09:04] Validation | Batch 460/732 | Loss: 0.3429 | LM: 0.2889
248
+ [2026-04-07 18:09:05] Validation | Batch 470/732 | Loss: 0.3427 | LM: 0.2886
249
+ [2026-04-07 18:09:06] Validation | Batch 480/732 | Loss: 0.3428 | LM: 0.2888
250
+ [2026-04-07 18:09:07] Validation | Batch 490/732 | Loss: 0.3439 | LM: 0.2898
251
+ [2026-04-07 18:09:08] Validation | Batch 500/732 | Loss: 0.3449 | LM: 0.2909
252
+ [2026-04-07 18:09:09] Validation | Batch 510/732 | Loss: 0.3446 | LM: 0.2905
253
+ [2026-04-07 18:09:10] Validation | Batch 520/732 | Loss: 0.3444 | LM: 0.2904
254
+ [2026-04-07 18:09:11] Validation | Batch 530/732 | Loss: 0.3438 | LM: 0.2898
255
+ [2026-04-07 18:09:12] Validation | Batch 540/732 | Loss: 0.3440 | LM: 0.2899
256
+ [2026-04-07 18:09:13] Validation | Batch 550/732 | Loss: 0.3439 | LM: 0.2899
257
+ [2026-04-07 18:09:14] Validation | Batch 560/732 | Loss: 0.3434 | LM: 0.2894
258
+ [2026-04-07 18:09:16] Validation | Batch 570/732 | Loss: 0.3435 | LM: 0.2894
259
+ [2026-04-07 18:09:17] Validation | Batch 580/732 | Loss: 0.3432 | LM: 0.2891
260
+ [2026-04-07 18:09:18] Validation | Batch 590/732 | Loss: 0.3432 | LM: 0.2891
261
+ [2026-04-07 18:09:19] Validation | Batch 600/732 | Loss: 0.3431 | LM: 0.2891
262
+ [2026-04-07 18:09:21] Validation | Batch 610/732 | Loss: 0.3437 | LM: 0.2896
263
+ [2026-04-07 18:09:22] Validation | Batch 620/732 | Loss: 0.3440 | LM: 0.2900
264
+ [2026-04-07 18:09:23] Validation | Batch 630/732 | Loss: 0.3438 | LM: 0.2898
265
+ [2026-04-07 18:09:24] Validation | Batch 640/732 | Loss: 0.3435 | LM: 0.2895
266
+ [2026-04-07 18:09:25] Validation | Batch 650/732 | Loss: 0.3434 | LM: 0.2894
267
+ [2026-04-07 18:09:26] Validation | Batch 660/732 | Loss: 0.3439 | LM: 0.2899
268
+ [2026-04-07 18:09:27] Validation | Batch 670/732 | Loss: 0.3445 | LM: 0.2905
269
+ [2026-04-07 18:09:28] Validation | Batch 680/732 | Loss: 0.3445 | LM: 0.2904
270
+ [2026-04-07 18:09:29] Validation | Batch 690/732 | Loss: 0.3447 | LM: 0.2906
271
+ [2026-04-07 18:09:31] Validation | Batch 700/732 | Loss: 0.3452 | LM: 0.2911
272
+ [2026-04-07 18:09:32] Validation | Batch 710/732 | Loss: 0.3455 | LM: 0.2915
273
+ [2026-04-07 18:09:33] Validation | Batch 720/732 | Loss: 0.3465 | LM: 0.2925
274
+ [2026-04-07 18:09:34] Validation | Batch 730/732 | Loss: 0.3462 | LM: 0.2921
275
+ [2026-04-07 18:09:34] Validation | Batch 732/732 | Loss: 0.3460 | LM: 0.2920
276
+ [2026-04-07 18:09:34] Validation | Loss: 0.3460 | LM: 0.2920 | PPL: 1.34 | Time: 81.84s
277
+ [2026-04-07 18:09:37] New best model saved! Val loss: 0.3460
278
+ [2026-04-07 18:09:46] Epoch 1 | Step 510 | Loss: 0.3613 | LM: 0.3012 | LB: 1.1036 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.426/SR1: 0.400 | LR: 1.00e-05
279
+ [2026-04-07 18:09:56] Epoch 1 | Step 520 | Loss: 0.3609 | LM: 0.3002 | LB: 1.1030 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.425/SR1: 0.399 | LR: 1.00e-05
280
+ [2026-04-07 18:10:04] Epoch 1 | Step 530 | Loss: 0.3606 | LM: 0.2993 | LB: 1.1027 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.425/SR1: 0.399 | LR: 1.00e-05
281
+ [2026-04-07 18:10:13] Epoch 1 | Step 540 | Loss: 0.3603 | LM: 0.2991 | LB: 1.1022 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.425/SR1: 0.399 | LR: 1.00e-05
282
+ [2026-04-07 18:10:22] Epoch 1 | Step 550 | Loss: 0.3600 | LM: 0.2996 | LB: 1.1017 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-05
283
+ [2026-04-07 18:10:31] Epoch 1 | Step 560 | Loss: 0.3602 | LM: 0.3004 | LB: 1.1015 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-05
284
+ [2026-04-07 18:10:40] Epoch 1 | Step 570 | Loss: 0.3603 | LM: 0.3008 | LB: 1.1013 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-05
285
+ [2026-04-07 18:10:49] Epoch 1 | Step 580 | Loss: 0.3598 | LM: 0.3006 | LB: 1.1011 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.424/SR1: 0.397 | LR: 1.00e-05
286
+ [2026-04-07 18:10:58] Epoch 1 | Step 590 | Loss: 0.3603 | LM: 0.3019 | LB: 1.1007 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.397 | LR: 1.00e-05
287
+ [2026-04-07 18:11:07] Epoch 1 | Step 600 | Loss: 0.3599 | LM: 0.3012 | LB: 1.1005 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.397 | LR: 1.00e-05
288
+ [2026-04-07 18:11:17] Epoch 1 | Step 610 | Loss: 0.3595 | LM: 0.3009 | LB: 1.1003 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.396 | LR: 1.00e-05
289
+ [2026-04-07 18:11:26] Epoch 1 | Step 620 | Loss: 0.3592 | LM: 0.3011 | LB: 1.1000 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.423/SR1: 0.396 | LR: 1.00e-05
290
+ [2026-04-07 18:11:35] Epoch 1 | Step 630 | Loss: 0.3588 | LM: 0.3008 | LB: 1.0997 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.422/SR1: 0.396 | LR: 1.00e-05
291
+ [2026-04-07 18:11:44] Epoch 1 | Step 640 | Loss: 0.3584 | LM: 0.3009 | LB: 1.0995 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.422/SR1: 0.395 | LR: 1.00e-05
292
+ [2026-04-07 18:11:53] Epoch 1 | Step 650 | Loss: 0.3586 | LM: 0.3004 | LB: 1.0992 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.353 | HR1: 0.422/SR1: 0.395 | LR: 1.00e-05
293
+ [2026-04-07 18:12:02] Epoch 1 | Step 660 | Loss: 0.3582 | LM: 0.3001 | LB: 1.0991 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.421/SR1: 0.395 | LR: 1.00e-05
294
+ [2026-04-07 18:12:11] Epoch 1 | Step 670 | Loss: 0.3588 | LM: 0.3005 | LB: 1.0989 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
295
+ [2026-04-07 18:12:20] Epoch 1 | Step 680 | Loss: 0.3585 | LM: 0.3008 | LB: 1.0988 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
296
+ [2026-04-07 18:12:29] Epoch 1 | Step 690 | Loss: 0.3586 | LM: 0.3014 | LB: 1.0988 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
297
+ [2026-04-07 18:12:38] Epoch 1 | Step 700 | Loss: 0.3587 | LM: 0.3014 | LB: 1.0986 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.421/SR1: 0.394 | LR: 1.00e-05
298
+ [2026-04-07 18:12:47] Epoch 1 | Step 710 | Loss: 0.3585 | LM: 0.3010 | LB: 1.0983 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
299
+ [2026-04-07 18:12:56] Epoch 1 | Step 720 | Loss: 0.3581 | LM: 0.3008 | LB: 1.0982 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
300
+ [2026-04-07 18:13:05] Epoch 1 | Step 730 | Loss: 0.3578 | LM: 0.3015 | LB: 1.0981 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
301
+ [2026-04-07 18:13:14] Epoch 1 | Step 740 | Loss: 0.3576 | LM: 0.3017 | LB: 1.0980 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
302
+ [2026-04-07 18:13:23] Epoch 1 | Step 750 | Loss: 0.3574 | LM: 0.3013 | LB: 1.0978 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-05
303
+ [2026-04-07 18:13:24] Validation | Batch 10/732 | Loss: 0.3281 | LM: 0.2740
304
+ [2026-04-07 18:13:25] Validation | Batch 20/732 | Loss: 0.3495 | LM: 0.2954
305
+ [2026-04-07 18:13:26] Validation | Batch 30/732 | Loss: 0.3407 | LM: 0.2866
306
+ [2026-04-07 18:13:27] Validation | Batch 40/732 | Loss: 0.3457 | LM: 0.2917
307
+ [2026-04-07 18:13:29] Validation | Batch 50/732 | Loss: 0.3455 | LM: 0.2915
308
+ [2026-04-07 18:13:30] Validation | Batch 60/732 | Loss: 0.3475 | LM: 0.2935
309
+ [2026-04-07 18:13:31] Validation | Batch 70/732 | Loss: 0.3510 | LM: 0.2969
310
+ [2026-04-07 18:13:32] Validation | Batch 80/732 | Loss: 0.3492 | LM: 0.2953
311
+ [2026-04-07 18:13:33] Validation | Batch 90/732 | Loss: 0.3491 | LM: 0.2951
312
+ [2026-04-07 18:13:34] Validation | Batch 100/732 | Loss: 0.3501 | LM: 0.2961
313
+ [2026-04-07 18:13:35] Validation | Batch 110/732 | Loss: 0.3468 | LM: 0.2928
314
+ [2026-04-07 18:13:37] Validation | Batch 120/732 | Loss: 0.3500 | LM: 0.2961
315
+ [2026-04-07 18:13:38] Validation | Batch 130/732 | Loss: 0.3513 | LM: 0.2973
316
+ [2026-04-07 18:13:39] Validation | Batch 140/732 | Loss: 0.3507 | LM: 0.2967
317
+ [2026-04-07 18:13:40] Validation | Batch 150/732 | Loss: 0.3500 | LM: 0.2961
318
+ [2026-04-07 18:13:41] Validation | Batch 160/732 | Loss: 0.3493 | LM: 0.2953
319
+ [2026-04-07 18:13:42] Validation | Batch 170/732 | Loss: 0.3498 | LM: 0.2959
320
+ [2026-04-07 18:13:43] Validation | Batch 180/732 | Loss: 0.3512 | LM: 0.2972
321
+ [2026-04-07 18:13:44] Validation | Batch 190/732 | Loss: 0.3505 | LM: 0.2965
322
+ [2026-04-07 18:13:45] Validation | Batch 200/732 | Loss: 0.3506 | LM: 0.2966
323
+ [2026-04-07 18:13:46] Validation | Batch 210/732 | Loss: 0.3499 | LM: 0.2959
324
+ [2026-04-07 18:13:47] Validation | Batch 220/732 | Loss: 0.3494 | LM: 0.2955
325
+ [2026-04-07 18:13:49] Validation | Batch 230/732 | Loss: 0.3499 | LM: 0.2959
326
+ [2026-04-07 18:13:50] Validation | Batch 240/732 | Loss: 0.3496 | LM: 0.2956
327
+ [2026-04-07 18:13:51] Validation | Batch 250/732 | Loss: 0.3496 | LM: 0.2956
328
+ [2026-04-07 18:13:52] Validation | Batch 260/732 | Loss: 0.3486 | LM: 0.2946
329
+ [2026-04-07 18:13:53] Validation | Batch 270/732 | Loss: 0.3484 | LM: 0.2944
330
+ [2026-04-07 18:13:54] Validation | Batch 280/732 | Loss: 0.3473 | LM: 0.2933
331
+ [2026-04-07 18:13:55] Validation | Batch 290/732 | Loss: 0.3471 | LM: 0.2931
332
+ [2026-04-07 18:13:56] Validation | Batch 300/732 | Loss: 0.3471 | LM: 0.2930
333
+ [2026-04-07 18:13:58] Validation | Batch 310/732 | Loss: 0.3469 | LM: 0.2929
334
+ [2026-04-07 18:13:59] Validation | Batch 320/732 | Loss: 0.3460 | LM: 0.2920
335
+ [2026-04-07 18:14:00] Validation | Batch 330/732 | Loss: 0.3449 | LM: 0.2909
336
+ [2026-04-07 18:14:01] Validation | Batch 340/732 | Loss: 0.3443 | LM: 0.2903
337
+ [2026-04-07 18:14:02] Validation | Batch 350/732 | Loss: 0.3446 | LM: 0.2906
338
+ [2026-04-07 18:14:03] Validation | Batch 360/732 | Loss: 0.3455 | LM: 0.2914
339
+ [2026-04-07 18:14:04] Validation | Batch 370/732 | Loss: 0.3445 | LM: 0.2905
340
+ [2026-04-07 18:14:05] Validation | Batch 380/732 | Loss: 0.3438 | LM: 0.2898
341
+ [2026-04-07 18:14:06] Validation | Batch 390/732 | Loss: 0.3435 | LM: 0.2895
342
+ [2026-04-07 18:14:07] Validation | Batch 400/732 | Loss: 0.3433 | LM: 0.2893
343
+ [2026-04-07 18:14:08] Validation | Batch 410/732 | Loss: 0.3426 | LM: 0.2886
344
+ [2026-04-07 18:14:09] Validation | Batch 420/732 | Loss: 0.3428 | LM: 0.2888
345
+ [2026-04-07 18:14:10] Validation | Batch 430/732 | Loss: 0.3427 | LM: 0.2887
346
+ [2026-04-07 18:14:12] Validation | Batch 440/732 | Loss: 0.3422 | LM: 0.2882
347
+ [2026-04-07 18:14:13] Validation | Batch 450/732 | Loss: 0.3421 | LM: 0.2881
348
+ [2026-04-07 18:14:14] Validation | Batch 460/732 | Loss: 0.3425 | LM: 0.2884
349
+ [2026-04-07 18:14:15] Validation | Batch 470/732 | Loss: 0.3422 | LM: 0.2882
350
+ [2026-04-07 18:14:16] Validation | Batch 480/732 | Loss: 0.3423 | LM: 0.2883
351
+ [2026-04-07 18:14:17] Validation | Batch 490/732 | Loss: 0.3434 | LM: 0.2894
352
+ [2026-04-07 18:14:18] Validation | Batch 500/732 | Loss: 0.3445 | LM: 0.2904
353
+ [2026-04-07 18:14:19] Validation | Batch 510/732 | Loss: 0.3441 | LM: 0.2901
354
+ [2026-04-07 18:14:20] Validation | Batch 520/732 | Loss: 0.3439 | LM: 0.2899
355
+ [2026-04-07 18:14:21] Validation | Batch 530/732 | Loss: 0.3433 | LM: 0.2893
356
+ [2026-04-07 18:14:23] Validation | Batch 540/732 | Loss: 0.3435 | LM: 0.2895
357
+ [2026-04-07 18:14:24] Validation | Batch 550/732 | Loss: 0.3435 | LM: 0.2894
358
+ [2026-04-07 18:14:25] Validation | Batch 560/732 | Loss: 0.3429 | LM: 0.2889
359
+ [2026-04-07 18:14:26] Validation | Batch 570/732 | Loss: 0.3430 | LM: 0.2890
360
+ [2026-04-07 18:14:27] Validation | Batch 580/732 | Loss: 0.3427 | LM: 0.2887
361
+ [2026-04-07 18:14:28] Validation | Batch 590/732 | Loss: 0.3427 | LM: 0.2887
362
+ [2026-04-07 18:14:30] Validation | Batch 600/732 | Loss: 0.3426 | LM: 0.2886
363
+ [2026-04-07 18:14:31] Validation | Batch 610/732 | Loss: 0.3432 | LM: 0.2892
364
+ [2026-04-07 18:14:32] Validation | Batch 620/732 | Loss: 0.3435 | LM: 0.2895
365
+ [2026-04-07 18:14:33] Validation | Batch 630/732 | Loss: 0.3433 | LM: 0.2893
366
+ [2026-04-07 18:14:34] Validation | Batch 640/732 | Loss: 0.3431 | LM: 0.2891
367
+ [2026-04-07 18:14:35] Validation | Batch 650/732 | Loss: 0.3429 | LM: 0.2889
368
+ [2026-04-07 18:14:36] Validation | Batch 660/732 | Loss: 0.3434 | LM: 0.2894
369
+ [2026-04-07 18:14:37] Validation | Batch 670/732 | Loss: 0.3440 | LM: 0.2900
370
+ [2026-04-07 18:14:38] Validation | Batch 680/732 | Loss: 0.3440 | LM: 0.2900
371
+ [2026-04-07 18:14:39] Validation | Batch 690/732 | Loss: 0.3442 | LM: 0.2902
372
+ [2026-04-07 18:14:41] Validation | Batch 700/732 | Loss: 0.3447 | LM: 0.2907
373
+ [2026-04-07 18:14:42] Validation | Batch 710/732 | Loss: 0.3451 | LM: 0.2910
374
+ [2026-04-07 18:14:43] Validation | Batch 720/732 | Loss: 0.3460 | LM: 0.2920
375
+ [2026-04-07 18:14:44] Validation | Batch 730/732 | Loss: 0.3457 | LM: 0.2917
376
+ [2026-04-07 18:14:44] Validation | Batch 732/732 | Loss: 0.3455 | LM: 0.2915
377
+ [2026-04-07 18:14:44] Validation | Loss: 0.3455 | LM: 0.2915 | PPL: 1.34 | Time: 81.42s
378
+ [2026-04-07 18:14:51] New best model saved! Val loss: 0.3455
379
+ [2026-04-07 18:15:00] Epoch 1 | Step 760 | Loss: 0.3571 | LM: 0.3005 | LB: 1.0976 | CL0: 2.8 | CL1: 2.4 | HR0: 0.357/SR0: 0.354 | HR1: 0.420/SR1: 0.392 | LR: 1.00e-05
380
+ [2026-04-07 18:15:09] Epoch 1 | Step 770 | Loss: 0.3569 | LM: 0.3004 | LB: 1.0976 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.420/SR1: 0.392 | LR: 1.00e-05
381
+ [2026-04-07 18:15:18] Epoch 1 | Step 780 | Loss: 0.3572 | LM: 0.3019 | LB: 1.0973 | CL0: 2.8 | CL1: 2.4 | HR0: 0.356/SR0: 0.354 | HR1: 0.420/SR1: 0.392 | LR: 1.00e-05
382
+ [2026-04-07 18:15:19] Reached max_steps=781, stopping training.
383
+ [2026-04-07 18:15:19] Epoch 1 completed in 992.60s | Loss: 0.3570 | CL0: 2.8 | CL1: 2.4
384
+ [2026-04-07 18:15:19]
385
+ Training completed!
386
+ [2026-04-07 18:15:22] Final model: outputs/N_4.0/model_final.pt
routing_tuning_test_07_04/N_6.0/.hydra/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 1
6
+ max_steps: null
7
+ batch_size: 8
8
+ eval_batch_size: 24
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ lr_multiplier:
22
+ - 2.0
23
+ - 1.5
24
+ - 1.0
25
+ load_balancing_weight: 0.05
26
+ load_balancing_N: 6.0
27
+ max_grad_norm: 1.0
28
+ use_amp: true
29
+ resume: false
30
+ resume_checkpoint: null
31
+ warmup_model: true
32
+ data:
33
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
34
+ max_context_len: 4096
35
+ max_target_len: 256
36
+ num_workers: 0
37
+ pin_memory: true
38
+ max_train_samples: 50000
39
+ max_val_samples: null
40
+ logging:
41
+ log_interval: 10
42
+ save_interval: 1000
43
+ eval_interval: 250
44
+ save_every_epoch: false
45
+ model_only_checkpoints: true
46
+ tracking:
47
+ enabled: true
48
+ project: routing-evolution
49
+ run_name: routing_N6.0
50
+ paths:
51
+ output_dir: outputs/N_${training.load_balancing_N}
52
+ seed: 42
53
+ device: cuda
routing_tuning_test_07_04/N_6.0/.hydra/hydra.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - training.load_balancing_N=6.0
116
+ - tracking.run_name=routing_N6.0
117
+ job:
118
+ name: train
119
+ chdir: false
120
+ override_dirname: tracking.run_name=routing_N6.0,training.load_balancing_N=6.0
121
+ id: ???
122
+ num: ???
123
+ config_name: config
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /workspace/byte-llms-code/routing_evolution_exp
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /workspace/byte-llms-code/routing_evolution_exp/configs
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_6.0
146
+ choices:
147
+ paths: default
148
+ tracking: default
149
+ logging: default
150
+ data: default
151
+ training: default
152
+ model: hnet_xl_code
153
+ hydra/env: default
154
+ hydra/callbacks: null
155
+ hydra/job_logging: default
156
+ hydra/hydra_logging: default
157
+ hydra/hydra_help: default
158
+ hydra/help: default
159
+ hydra/sweeper: basic
160
+ hydra/launcher: basic
161
+ hydra/output: default
162
+ verbose: false
routing_tuning_test_07_04/N_6.0/.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - training.load_balancing_N=6.0
2
+ - tracking.run_name=routing_N6.0
routing_tuning_test_07_04/N_6.0/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:064b89f7fd706b1f705261db40d657000945a625a7099ed3a27ff2992de07de4
3
+ size 3315165139
routing_tuning_test_07_04/N_6.0/model_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e1dfd3b4f217116471843f4e33c225ad9497993eaa6c199237d26f00a77eeb
3
+ size 3315165484
routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
3
+ size 13633736
routing_tuning_test_07_04/N_6.0/routing_weights/routing_step_781.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fef4e624ccaafb1e4e287dfcc9afdf773ba6a0833c95fafe489ad0cbe32ee7ce
3
+ size 13633752
routing_tuning_test_07_04/N_6.0/train.log ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-04-07 18:15:40] CUDA_VISIBLE_DEVICES: 0,1
2
+ [2026-04-07 18:15:40] Number of processes: 2
3
+ [2026-04-07 18:15:40] Mixed precision: bf16
4
+ [2026-04-07 18:15:40] ============================================================
5
+ [2026-04-07 18:15:40] Routing Evolution Experiment | N=6.0
6
+ [2026-04-07 18:15:40] ============================================================
7
+ [2026-04-07 18:15:40] Config:
8
+ model:
9
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
10
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
11
+ training:
12
+ epochs: 1
13
+ max_steps: null
14
+ batch_size: 8
15
+ eval_batch_size: 24
16
+ gradient_accumulation_steps: 4
17
+ lr: 0.0001
18
+ weight_decay: 0.1
19
+ betas:
20
+ - 0.9
21
+ - 0.95
22
+ eps: 1.0e-08
23
+ lr_scheduler: wsd
24
+ warmup_ratio: 0.1
25
+ decay_ratio: 0.2
26
+ warmup_steps: 100
27
+ min_lr_ratio: 0.1
28
+ lr_multiplier:
29
+ - 2.0
30
+ - 1.5
31
+ - 1.0
32
+ load_balancing_weight: 0.05
33
+ load_balancing_N: 6.0
34
+ max_grad_norm: 1.0
35
+ use_amp: true
36
+ resume: false
37
+ resume_checkpoint: null
38
+ warmup_model: true
39
+ data:
40
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
41
+ max_context_len: 4096
42
+ max_target_len: 256
43
+ num_workers: 0
44
+ pin_memory: true
45
+ max_train_samples: 50000
46
+ max_val_samples: null
47
+ logging:
48
+ log_interval: 10
49
+ save_interval: 1000
50
+ eval_interval: 250
51
+ save_every_epoch: false
52
+ model_only_checkpoints: true
53
+ tracking:
54
+ enabled: true
55
+ project: routing-evolution
56
+ run_name: routing_N6.0
57
+ paths:
58
+ output_dir: outputs/N_6.0
59
+ seed: 42
60
+ device: cuda
61
+
62
+ [2026-04-07 18:15:40] Loading model...
63
+ [2026-04-07 18:15:46] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
64
+ [2026-04-07 18:15:46] Applied LR multipliers: [2.0, 1.5, 1.0]
65
+ [2026-04-07 18:15:46] Warming up model...
66
+ [2026-04-07 18:16:31] Total params: 1,654,090,112
67
+ [2026-04-07 18:16:31] Trainable params: 1,654,090,112
68
+ [2026-04-07 18:16:31] Creating dataloaders...
69
+ [2026-04-07 18:16:31] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
70
+ [2026-04-07 18:16:31] Max steps: 781, Steps per epoch: 3125
71
+ [2026-04-07 18:16:33] Starting training...
72
+ [2026-04-07 18:16:33]
73
+ ============================================================
74
+ [2026-04-07 18:16:33] EPOCH 1/1 (step 0)
75
+ [2026-04-07 18:16:33] ============================================================
76
+ [2026-04-07 18:17:07] Epoch 1 | Step 10 | Loss: 0.7189 | LM: 0.6496 | LB: 1.4740 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
77
+ [2026-04-07 18:17:16] Epoch 1 | Step 20 | Loss: 0.6183 | LM: 0.5507 | LB: 1.4702 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.458 | LR: 5.62e-05
78
+ [2026-04-07 18:17:26] Epoch 1 | Step 30 | Loss: 0.5597 | LM: 0.4786 | LB: 1.4658 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.478/SR1: 0.456 | LR: 7.92e-05
79
+ [2026-04-07 18:17:35] Epoch 1 | Step 40 | Loss: 0.5234 | LM: 0.4475 | LB: 1.4524 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.474/SR1: 0.454 | LR: 1.00e-04
80
+ [2026-04-07 18:17:44] Epoch 1 | Step 50 | Loss: 0.5007 | LM: 0.4094 | LB: 1.4414 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.357 | HR1: 0.469/SR1: 0.449 | LR: 1.00e-04
81
+ [2026-04-07 18:17:53] Epoch 1 | Step 60 | Loss: 0.4742 | LM: 0.3811 | LB: 1.4383 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.468/SR1: 0.448 | LR: 1.00e-04
82
+ [2026-04-07 18:18:02] Epoch 1 | Step 70 | Loss: 0.4596 | LM: 0.3773 | LB: 1.4349 | CL0: 2.8 | CL1: 2.2 | HR0: 0.359/SR0: 0.358 | HR1: 0.465/SR1: 0.445 | LR: 1.00e-04
83
+ [2026-04-07 18:18:11] Epoch 1 | Step 80 | Loss: 0.4446 | LM: 0.3666 | LB: 1.4292 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.358 | HR1: 0.462/SR1: 0.441 | LR: 1.00e-04
84
+ [2026-04-07 18:18:20] Epoch 1 | Step 90 | Loss: 0.4359 | LM: 0.3536 | LB: 1.4221 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.358 | HR1: 0.458/SR1: 0.437 | LR: 1.00e-04
85
+ [2026-04-07 18:18:29] Epoch 1 | Step 100 | Loss: 0.4306 | LM: 0.3510 | LB: 1.4150 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.356 | HR1: 0.456/SR1: 0.435 | LR: 1.00e-04
86
+ [2026-04-07 18:18:39] Epoch 1 | Step 110 | Loss: 0.4261 | LM: 0.3459 | LB: 1.4085 | CL0: 2.8 | CL1: 2.2 | HR0: 0.358/SR0: 0.356 | HR1: 0.453/SR1: 0.431 | LR: 1.00e-04
87
+ [2026-04-07 18:18:48] Epoch 1 | Step 120 | Loss: 0.4234 | LM: 0.3474 | LB: 1.4018 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.355 | HR1: 0.451/SR1: 0.428 | LR: 1.00e-04
88
+ [2026-04-07 18:18:57] Epoch 1 | Step 130 | Loss: 0.4192 | LM: 0.3443 | LB: 1.3968 | CL0: 2.8 | CL1: 2.2 | HR0: 0.356/SR0: 0.355 | HR1: 0.449/SR1: 0.426 | LR: 1.00e-04
89
+ [2026-04-07 18:19:06] Epoch 1 | Step 140 | Loss: 0.4148 | LM: 0.3400 | LB: 1.3902 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.445/SR1: 0.423 | LR: 1.00e-04
90
+ [2026-04-07 18:19:15] Epoch 1 | Step 150 | Loss: 0.4123 | LM: 0.3377 | LB: 1.3854 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.443/SR1: 0.420 | LR: 1.00e-04
91
+ [2026-04-07 18:19:24] Epoch 1 | Step 160 | Loss: 0.4078 | LM: 0.3323 | LB: 1.3805 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.441/SR1: 0.418 | LR: 1.00e-04
92
+ [2026-04-07 18:19:33] Epoch 1 | Step 170 | Loss: 0.4029 | LM: 0.3276 | LB: 1.3761 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.439/SR1: 0.415 | LR: 1.00e-04
93
+ [2026-04-07 18:19:42] Epoch 1 | Step 180 | Loss: 0.3992 | LM: 0.3236 | LB: 1.3724 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.353 | HR1: 0.437/SR1: 0.413 | LR: 1.00e-04
94
+ [2026-04-07 18:19:51] Epoch 1 | Step 190 | Loss: 0.3974 | LM: 0.3234 | LB: 1.3686 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.434/SR1: 0.410 | LR: 1.00e-04
95
+ [2026-04-07 18:20:00] Epoch 1 | Step 200 | Loss: 0.3948 | LM: 0.3207 | LB: 1.3641 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.432/SR1: 0.408 | LR: 1.00e-04
96
+ [2026-04-07 18:20:09] Epoch 1 | Step 210 | Loss: 0.3945 | LM: 0.3213 | LB: 1.3598 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.430/SR1: 0.406 | LR: 1.00e-04
97
+ [2026-04-07 18:20:18] Epoch 1 | Step 220 | Loss: 0.3935 | LM: 0.3194 | LB: 1.3568 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.429/SR1: 0.404 | LR: 1.00e-04
98
+ [2026-04-07 18:20:27] Epoch 1 | Step 230 | Loss: 0.3920 | LM: 0.3186 | LB: 1.3533 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.353 | HR1: 0.427/SR1: 0.402 | LR: 1.00e-04
99
+ [2026-04-07 18:20:36] Epoch 1 | Step 240 | Loss: 0.3898 | LM: 0.3196 | LB: 1.3500 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.425/SR1: 0.400 | LR: 1.00e-04
100
+ [2026-04-07 18:20:45] Epoch 1 | Step 250 | Loss: 0.3874 | LM: 0.3177 | LB: 1.3465 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.424/SR1: 0.398 | LR: 1.00e-04
101
+ [2026-04-07 18:20:46] Validation | Batch 10/732 | Loss: 0.3396 | LM: 0.2766
102
+ [2026-04-07 18:20:47] Validation | Batch 20/732 | Loss: 0.3626 | LM: 0.2992
103
+ [2026-04-07 18:20:48] Validation | Batch 30/732 | Loss: 0.3535 | LM: 0.2904
104
+ [2026-04-07 18:20:49] Validation | Batch 40/732 | Loss: 0.3589 | LM: 0.2958
105
+ [2026-04-07 18:20:51] Validation | Batch 50/732 | Loss: 0.3593 | LM: 0.2962
106
+ [2026-04-07 18:20:52] Validation | Batch 60/732 | Loss: 0.3618 | LM: 0.2987
107
+ [2026-04-07 18:20:53] Validation | Batch 70/732 | Loss: 0.3653 | LM: 0.3022
108
+ [2026-04-07 18:20:54] Validation | Batch 80/732 | Loss: 0.3634 | LM: 0.3004
109
+ [2026-04-07 18:20:55] Validation | Batch 90/732 | Loss: 0.3630 | LM: 0.3000
110
+ [2026-04-07 18:20:56] Validation | Batch 100/732 | Loss: 0.3639 | LM: 0.3009
111
+ [2026-04-07 18:20:57] Validation | Batch 110/732 | Loss: 0.3607 | LM: 0.2976
112
+ [2026-04-07 18:20:58] Validation | Batch 120/732 | Loss: 0.3640 | LM: 0.3010
113
+ [2026-04-07 18:20:59] Validation | Batch 130/732 | Loss: 0.3653 | LM: 0.3024
114
+ [2026-04-07 18:21:00] Validation | Batch 140/732 | Loss: 0.3647 | LM: 0.3017
115
+ [2026-04-07 18:21:02] Validation | Batch 150/732 | Loss: 0.3641 | LM: 0.3010
116
+ [2026-04-07 18:21:03] Validation | Batch 160/732 | Loss: 0.3631 | LM: 0.3001
117
+ [2026-04-07 18:21:03] Validation | Batch 170/732 | Loss: 0.3635 | LM: 0.3005
118
+ [2026-04-07 18:21:05] Validation | Batch 180/732 | Loss: 0.3649 | LM: 0.3019
119
+ [2026-04-07 18:21:06] Validation | Batch 190/732 | Loss: 0.3642 | LM: 0.3012
120
+ [2026-04-07 18:21:07] Validation | Batch 200/732 | Loss: 0.3643 | LM: 0.3013
121
+ [2026-04-07 18:21:08] Validation | Batch 210/732 | Loss: 0.3635 | LM: 0.3005
122
+ [2026-04-07 18:21:09] Validation | Batch 220/732 | Loss: 0.3630 | LM: 0.2999
123
+ [2026-04-07 18:21:10] Validation | Batch 230/732 | Loss: 0.3634 | LM: 0.3004
124
+ [2026-04-07 18:21:11] Validation | Batch 240/732 | Loss: 0.3631 | LM: 0.3001
125
+ [2026-04-07 18:21:13] Validation | Batch 250/732 | Loss: 0.3632 | LM: 0.3001
126
+ [2026-04-07 18:21:14] Validation | Batch 260/732 | Loss: 0.3622 | LM: 0.2991
127
+ [2026-04-07 18:21:15] Validation | Batch 270/732 | Loss: 0.3619 | LM: 0.2988
128
+ [2026-04-07 18:21:16] Validation | Batch 280/732 | Loss: 0.3608 | LM: 0.2977
129
+ [2026-04-07 18:21:17] Validation | Batch 290/732 | Loss: 0.3607 | LM: 0.2976
130
+ [2026-04-07 18:21:18] Validation | Batch 300/732 | Loss: 0.3606 | LM: 0.2975
131
+ [2026-04-07 18:21:19] Validation | Batch 310/732 | Loss: 0.3605 | LM: 0.2974
132
+ [2026-04-07 18:21:20] Validation | Batch 320/732 | Loss: 0.3595 | LM: 0.2964
133
+ [2026-04-07 18:21:21] Validation | Batch 330/732 | Loss: 0.3584 | LM: 0.2953
134
+ [2026-04-07 18:21:22] Validation | Batch 340/732 | Loss: 0.3578 | LM: 0.2947
135
+ [2026-04-07 18:21:23] Validation | Batch 350/732 | Loss: 0.3581 | LM: 0.2950
136
+ [2026-04-07 18:21:24] Validation | Batch 360/732 | Loss: 0.3589 | LM: 0.2958
137
+ [2026-04-07 18:21:25] Validation | Batch 370/732 | Loss: 0.3580 | LM: 0.2948
138
+ [2026-04-07 18:21:26] Validation | Batch 380/732 | Loss: 0.3573 | LM: 0.2942
139
+ [2026-04-07 18:21:27] Validation | Batch 390/732 | Loss: 0.3569 | LM: 0.2938
140
+ [2026-04-07 18:21:28] Validation | Batch 400/732 | Loss: 0.3567 | LM: 0.2936
141
+ [2026-04-07 18:21:29] Validation | Batch 410/732 | Loss: 0.3560 | LM: 0.2929
142
+ [2026-04-07 18:21:30] Validation | Batch 420/732 | Loss: 0.3562 | LM: 0.2931
143
+ [2026-04-07 18:21:31] Validation | Batch 430/732 | Loss: 0.3561 | LM: 0.2930
144
+ [2026-04-07 18:21:32] Validation | Batch 440/732 | Loss: 0.3556 | LM: 0.2924
145
+ [2026-04-07 18:21:33] Validation | Batch 450/732 | Loss: 0.3554 | LM: 0.2923
146
+ [2026-04-07 18:21:35] Validation | Batch 460/732 | Loss: 0.3557 | LM: 0.2926
147
+ [2026-04-07 18:21:36] Validation | Batch 470/732 | Loss: 0.3555 | LM: 0.2924
148
+ [2026-04-07 18:21:37] Validation | Batch 480/732 | Loss: 0.3557 | LM: 0.2926
149
+ [2026-04-07 18:21:38] Validation | Batch 490/732 | Loss: 0.3568 | LM: 0.2937
150
+ [2026-04-07 18:21:39] Validation | Batch 500/732 | Loss: 0.3579 | LM: 0.2947
151
+ [2026-04-07 18:21:40] Validation | Batch 510/732 | Loss: 0.3575 | LM: 0.2944
152
+ [2026-04-07 18:21:41] Validation | Batch 520/732 | Loss: 0.3573 | LM: 0.2942
153
+ [2026-04-07 18:21:42] Validation | Batch 530/732 | Loss: 0.3567 | LM: 0.2936
154
+ [2026-04-07 18:21:43] Validation | Batch 540/732 | Loss: 0.3569 | LM: 0.2937
155
+ [2026-04-07 18:21:44] Validation | Batch 550/732 | Loss: 0.3568 | LM: 0.2937
156
+ [2026-04-07 18:21:45] Validation | Batch 560/732 | Loss: 0.3564 | LM: 0.2933
157
+ [2026-04-07 18:21:46] Validation | Batch 570/732 | Loss: 0.3565 | LM: 0.2934
158
+ [2026-04-07 18:21:48] Validation | Batch 580/732 | Loss: 0.3562 | LM: 0.2931
159
+ [2026-04-07 18:21:49] Validation | Batch 590/732 | Loss: 0.3562 | LM: 0.2930
160
+ [2026-04-07 18:21:50] Validation | Batch 600/732 | Loss: 0.3561 | LM: 0.2930
161
+ [2026-04-07 18:21:51] Validation | Batch 610/732 | Loss: 0.3567 | LM: 0.2936
162
+ [2026-04-07 18:21:52] Validation | Batch 620/732 | Loss: 0.3571 | LM: 0.2940
163
+ [2026-04-07 18:21:53] Validation | Batch 630/732 | Loss: 0.3569 | LM: 0.2937
164
+ [2026-04-07 18:21:55] Validation | Batch 640/732 | Loss: 0.3566 | LM: 0.2935
165
+ [2026-04-07 18:21:56] Validation | Batch 650/732 | Loss: 0.3565 | LM: 0.2933
166
+ [2026-04-07 18:21:57] Validation | Batch 660/732 | Loss: 0.3569 | LM: 0.2938
167
+ [2026-04-07 18:21:58] Validation | Batch 670/732 | Loss: 0.3576 | LM: 0.2944
168
+ [2026-04-07 18:21:59] Validation | Batch 680/732 | Loss: 0.3575 | LM: 0.2944
169
+ [2026-04-07 18:22:00] Validation | Batch 690/732 | Loss: 0.3577 | LM: 0.2946
170
+ [2026-04-07 18:22:01] Validation | Batch 700/732 | Loss: 0.3582 | LM: 0.2951
171
+ [2026-04-07 18:22:02] Validation | Batch 710/732 | Loss: 0.3586 | LM: 0.2955
172
+ [2026-04-07 18:22:03] Validation | Batch 720/732 | Loss: 0.3596 | LM: 0.2964
173
+ [2026-04-07 18:22:04] Validation | Batch 730/732 | Loss: 0.3592 | LM: 0.2961
174
+ [2026-04-07 18:22:04] Validation | Batch 732/732 | Loss: 0.3591 | LM: 0.2959
175
+ [2026-04-07 18:22:04] Validation | Loss: 0.3591 | LM: 0.2959 | PPL: 1.35 | Time: 79.46s
176
+ [2026-04-07 18:22:07] New best model saved! Val loss: 0.3591
177
+ [2026-04-07 18:22:16] Epoch 1 | Step 260 | Loss: 0.3854 | LM: 0.3169 | LB: 1.3433 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.422/SR1: 0.396 | LR: 1.00e-04
178
+ [2026-04-07 18:22:25] Epoch 1 | Step 270 | Loss: 0.3854 | LM: 0.3153 | LB: 1.3405 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.420/SR1: 0.394 | LR: 1.00e-04
179
+ [2026-04-07 18:22:34] Epoch 1 | Step 280 | Loss: 0.3852 | LM: 0.3155 | LB: 1.3376 | CL0: 2.8 | CL1: 2.4 | HR0: 0.354/SR0: 0.352 | HR1: 0.419/SR1: 0.393 | LR: 1.00e-04
180
+ [2026-04-07 18:22:43] Epoch 1 | Step 290 | Loss: 0.3834 | LM: 0.3135 | LB: 1.3348 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.417/SR1: 0.391 | LR: 1.00e-04
181
+ [2026-04-07 18:22:52] Epoch 1 | Step 300 | Loss: 0.3827 | LM: 0.3130 | LB: 1.3326 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.416/SR1: 0.389 | LR: 1.00e-04
182
+ [2026-04-07 18:23:01] Epoch 1 | Step 310 | Loss: 0.3817 | LM: 0.3121 | LB: 1.3305 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.415/SR1: 0.387 | LR: 1.00e-04
183
+ [2026-04-07 18:23:10] Epoch 1 | Step 320 | Loss: 0.3810 | LM: 0.3115 | LB: 1.3280 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.413/SR1: 0.386 | LR: 9.80e-05
184
+ [2026-04-07 18:23:19] Epoch 1 | Step 330 | Loss: 0.3796 | LM: 0.3106 | LB: 1.3255 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.412/SR1: 0.384 | LR: 8.93e-05
185
+ [2026-04-07 18:23:28] Epoch 1 | Step 340 | Loss: 0.3788 | LM: 0.3099 | LB: 1.3231 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.410/SR1: 0.383 | LR: 7.51e-05
186
+ [2026-04-07 18:23:37] Epoch 1 | Step 350 | Loss: 0.3782 | LM: 0.3105 | LB: 1.3210 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.409/SR1: 0.381 | LR: 5.77e-05
187
+ [2026-04-07 18:23:46] Epoch 1 | Step 360 | Loss: 0.3774 | LM: 0.3120 | LB: 1.3191 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.408/SR1: 0.380 | LR: 3.99e-05
188
+ [2026-04-07 18:23:55] Epoch 1 | Step 370 | Loss: 0.3769 | LM: 0.3116 | LB: 1.3176 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.407/SR1: 0.379 | LR: 2.45e-05
189
+ [2026-04-07 18:24:04] Epoch 1 | Step 380 | Loss: 0.3758 | LM: 0.3097 | LB: 1.3158 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.407/SR1: 0.378 | LR: 1.40e-05
190
+ [2026-04-07 18:24:13] Epoch 1 | Step 390 | Loss: 0.3759 | LM: 0.3095 | LB: 1.3138 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.352 | HR1: 0.405/SR1: 0.377 | LR: 1.00e-05
191
+ [2026-04-07 18:24:22] Epoch 1 | Step 400 | Loss: 0.3757 | LM: 0.3102 | LB: 1.3121 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.405/SR1: 0.376 | LR: 1.00e-05
192
+ [2026-04-07 18:24:31] Epoch 1 | Step 410 | Loss: 0.3756 | LM: 0.3101 | LB: 1.3104 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.404/SR1: 0.375 | LR: 1.00e-05
193
+ [2026-04-07 18:24:40] Epoch 1 | Step 420 | Loss: 0.3756 | LM: 0.3095 | LB: 1.3089 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.403/SR1: 0.374 | LR: 1.00e-05
194
+ [2026-04-07 18:24:49] Epoch 1 | Step 430 | Loss: 0.3750 | LM: 0.3078 | LB: 1.3075 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.402/SR1: 0.373 | LR: 1.00e-05
195
+ [2026-04-07 18:24:58] Epoch 1 | Step 440 | Loss: 0.3742 | LM: 0.3069 | LB: 1.3064 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.351 | HR1: 0.402/SR1: 0.373 | LR: 1.00e-05
196
+ [2026-04-07 18:25:07] Epoch 1 | Step 450 | Loss: 0.3735 | LM: 0.3056 | LB: 1.3051 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.401/SR1: 0.372 | LR: 1.00e-05
197
+ [2026-04-07 18:25:16] Epoch 1 | Step 460 | Loss: 0.3736 | LM: 0.3047 | LB: 1.3038 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.401/SR1: 0.371 | LR: 1.00e-05
198
+ [2026-04-07 18:25:25] Epoch 1 | Step 470 | Loss: 0.3734 | LM: 0.3055 | LB: 1.3029 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.400/SR1: 0.370 | LR: 1.00e-05
199
+ [2026-04-07 18:25:34] Epoch 1 | Step 480 | Loss: 0.3730 | LM: 0.3048 | LB: 1.3016 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.350 | HR1: 0.399/SR1: 0.369 | LR: 1.00e-05
200
+ [2026-04-07 18:25:43] Epoch 1 | Step 490 | Loss: 0.3722 | LM: 0.3037 | LB: 1.3006 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.351 | HR1: 0.399/SR1: 0.368 | LR: 1.00e-05
201
+ [2026-04-07 18:25:52] Epoch 1 | Step 500 | Loss: 0.3724 | LM: 0.3041 | LB: 1.2995 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.350 | HR1: 0.398/SR1: 0.368 | LR: 1.00e-05
202
+ [2026-04-07 18:25:53] Validation | Batch 10/732 | Loss: 0.3368 | LM: 0.2749
203
+ [2026-04-07 18:25:54] Validation | Batch 20/732 | Loss: 0.3593 | LM: 0.2970
204
+ [2026-04-07 18:25:56] Validation | Batch 30/732 | Loss: 0.3508 | LM: 0.2887
205
+ [2026-04-07 18:25:57] Validation | Batch 40/732 | Loss: 0.3562 | LM: 0.2941
206
+ [2026-04-07 18:25:58] Validation | Batch 50/732 | Loss: 0.3562 | LM: 0.2941
207
+ [2026-04-07 18:25:59] Validation | Batch 60/732 | Loss: 0.3582 | LM: 0.2961
208
+ [2026-04-07 18:26:00] Validation | Batch 70/732 | Loss: 0.3616 | LM: 0.2995
209
+ [2026-04-07 18:26:01] Validation | Batch 80/732 | Loss: 0.3598 | LM: 0.2979
210
+ [2026-04-07 18:26:02] Validation | Batch 90/732 | Loss: 0.3596 | LM: 0.2977
211
+ [2026-04-07 18:26:03] Validation | Batch 100/732 | Loss: 0.3606 | LM: 0.2986
212
+ [2026-04-07 18:26:05] Validation | Batch 110/732 | Loss: 0.3573 | LM: 0.2953
213
+ [2026-04-07 18:26:06] Validation | Batch 120/732 | Loss: 0.3604 | LM: 0.2984
214
+ [2026-04-07 18:26:07] Validation | Batch 130/732 | Loss: 0.3616 | LM: 0.2996
215
+ [2026-04-07 18:26:08] Validation | Batch 140/732 | Loss: 0.3610 | LM: 0.2990
216
+ [2026-04-07 18:26:09] Validation | Batch 150/732 | Loss: 0.3604 | LM: 0.2984
217
+ [2026-04-07 18:26:10] Validation | Batch 160/732 | Loss: 0.3595 | LM: 0.2975
218
+ [2026-04-07 18:26:11] Validation | Batch 170/732 | Loss: 0.3600 | LM: 0.2980
219
+ [2026-04-07 18:26:12] Validation | Batch 180/732 | Loss: 0.3614 | LM: 0.2995
220
+ [2026-04-07 18:26:13] Validation | Batch 190/732 | Loss: 0.3607 | LM: 0.2987
221
+ [2026-04-07 18:26:14] Validation | Batch 200/732 | Loss: 0.3608 | LM: 0.2989
222
+ [2026-04-07 18:26:15] Validation | Batch 210/732 | Loss: 0.3601 | LM: 0.2981
223
+ [2026-04-07 18:26:16] Validation | Batch 220/732 | Loss: 0.3596 | LM: 0.2976
224
+ [2026-04-07 18:26:17] Validation | Batch 230/732 | Loss: 0.3600 | LM: 0.2980
225
+ [2026-04-07 18:26:19] Validation | Batch 240/732 | Loss: 0.3597 | LM: 0.2978
226
+ [2026-04-07 18:26:20] Validation | Batch 250/732 | Loss: 0.3597 | LM: 0.2977
227
+ [2026-04-07 18:26:21] Validation | Batch 260/732 | Loss: 0.3587 | LM: 0.2967
228
+ [2026-04-07 18:26:22] Validation | Batch 270/732 | Loss: 0.3585 | LM: 0.2965
229
+ [2026-04-07 18:26:23] Validation | Batch 280/732 | Loss: 0.3576 | LM: 0.2955
230
+ [2026-04-07 18:26:24] Validation | Batch 290/732 | Loss: 0.3573 | LM: 0.2953
231
+ [2026-04-07 18:26:25] Validation | Batch 300/732 | Loss: 0.3573 | LM: 0.2953
232
+ [2026-04-07 18:26:26] Validation | Batch 310/732 | Loss: 0.3572 | LM: 0.2951
233
+ [2026-04-07 18:26:27] Validation | Batch 320/732 | Loss: 0.3562 | LM: 0.2942
234
+ [2026-04-07 18:26:28] Validation | Batch 330/732 | Loss: 0.3551 | LM: 0.2931
235
+ [2026-04-07 18:26:29] Validation | Batch 340/732 | Loss: 0.3545 | LM: 0.2925
236
+ [2026-04-07 18:26:30] Validation | Batch 350/732 | Loss: 0.3549 | LM: 0.2929
237
+ [2026-04-07 18:26:31] Validation | Batch 360/732 | Loss: 0.3557 | LM: 0.2937
238
+ [2026-04-07 18:26:32] Validation | Batch 370/732 | Loss: 0.3548 | LM: 0.2927
239
+ [2026-04-07 18:26:33] Validation | Batch 380/732 | Loss: 0.3541 | LM: 0.2921
240
+ [2026-04-07 18:26:34] Validation | Batch 390/732 | Loss: 0.3538 | LM: 0.2917
241
+ [2026-04-07 18:26:35] Validation | Batch 400/732 | Loss: 0.3536 | LM: 0.2916
242
+ [2026-04-07 18:26:36] Validation | Batch 410/732 | Loss: 0.3529 | LM: 0.2908
243
+ [2026-04-07 18:26:37] Validation | Batch 420/732 | Loss: 0.3531 | LM: 0.2911
244
+ [2026-04-07 18:26:38] Validation | Batch 430/732 | Loss: 0.3531 | LM: 0.2910
245
+ [2026-04-07 18:26:40] Validation | Batch 440/732 | Loss: 0.3525 | LM: 0.2905
246
+ [2026-04-07 18:26:41] Validation | Batch 450/732 | Loss: 0.3524 | LM: 0.2903
247
+ [2026-04-07 18:26:42] Validation | Batch 460/732 | Loss: 0.3527 | LM: 0.2907
248
+ [2026-04-07 18:26:43] Validation | Batch 470/732 | Loss: 0.3525 | LM: 0.2904
249
+ [2026-04-07 18:26:44] Validation | Batch 480/732 | Loss: 0.3526 | LM: 0.2906
250
+ [2026-04-07 18:26:45] Validation | Batch 490/732 | Loss: 0.3537 | LM: 0.2916
251
+ [2026-04-07 18:26:46] Validation | Batch 500/732 | Loss: 0.3547 | LM: 0.2927
252
+ [2026-04-07 18:26:47] Validation | Batch 510/732 | Loss: 0.3544 | LM: 0.2924
253
+ [2026-04-07 18:26:48] Validation | Batch 520/732 | Loss: 0.3542 | LM: 0.2921
254
+ [2026-04-07 18:26:49] Validation | Batch 530/732 | Loss: 0.3537 | LM: 0.2916
255
+ [2026-04-07 18:26:50] Validation | Batch 540/732 | Loss: 0.3538 | LM: 0.2918
256
+ [2026-04-07 18:26:51] Validation | Batch 550/732 | Loss: 0.3538 | LM: 0.2917
257
+ [2026-04-07 18:26:52] Validation | Batch 560/732 | Loss: 0.3533 | LM: 0.2912
258
+ [2026-04-07 18:26:53] Validation | Batch 570/732 | Loss: 0.3533 | LM: 0.2913
259
+ [2026-04-07 18:26:55] Validation | Batch 580/732 | Loss: 0.3530 | LM: 0.2910
260
+ [2026-04-07 18:26:56] Validation | Batch 590/732 | Loss: 0.3530 | LM: 0.2910
261
+ [2026-04-07 18:26:57] Validation | Batch 600/732 | Loss: 0.3530 | LM: 0.2909
262
+ [2026-04-07 18:26:58] Validation | Batch 610/732 | Loss: 0.3535 | LM: 0.2914
263
+ [2026-04-07 18:26:59] Validation | Batch 620/732 | Loss: 0.3539 | LM: 0.2918
264
+ [2026-04-07 18:27:00] Validation | Batch 630/732 | Loss: 0.3537 | LM: 0.2916
265
+ [2026-04-07 18:27:02] Validation | Batch 640/732 | Loss: 0.3534 | LM: 0.2913
266
+ [2026-04-07 18:27:03] Validation | Batch 650/732 | Loss: 0.3532 | LM: 0.2912
267
+ [2026-04-07 18:27:04] Validation | Batch 660/732 | Loss: 0.3537 | LM: 0.2917
268
+ [2026-04-07 18:27:05] Validation | Batch 670/732 | Loss: 0.3544 | LM: 0.2923
269
+ [2026-04-07 18:27:06] Validation | Batch 680/732 | Loss: 0.3544 | LM: 0.2923
270
+ [2026-04-07 18:27:07] Validation | Batch 690/732 | Loss: 0.3545 | LM: 0.2924
271
+ [2026-04-07 18:27:08] Validation | Batch 700/732 | Loss: 0.3550 | LM: 0.2929
272
+ [2026-04-07 18:27:09] Validation | Batch 710/732 | Loss: 0.3554 | LM: 0.2933
273
+ [2026-04-07 18:27:10] Validation | Batch 720/732 | Loss: 0.3563 | LM: 0.2943
274
+ [2026-04-07 18:27:11] Validation | Batch 730/732 | Loss: 0.3560 | LM: 0.2939
275
+ [2026-04-07 18:27:11] Validation | Batch 732/732 | Loss: 0.3558 | LM: 0.2938
276
+ [2026-04-07 18:27:11] Validation | Loss: 0.3558 | LM: 0.2938 | PPL: 1.34 | Time: 78.98s
277
+ [2026-04-07 18:27:14] New best model saved! Val loss: 0.3558
278
+ [2026-04-07 18:27:23] Epoch 1 | Step 510 | Loss: 0.3723 | LM: 0.3024 | LB: 1.2988 | CL0: 2.8 | CL1: 2.5 | HR0: 0.355/SR0: 0.350 | HR1: 0.398/SR1: 0.367 | LR: 1.00e-05
279
+ [2026-04-07 18:27:33] Epoch 1 | Step 520 | Loss: 0.3719 | LM: 0.3015 | LB: 1.2975 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.397/SR1: 0.366 | LR: 1.00e-05
280
+ [2026-04-07 18:27:42] Epoch 1 | Step 530 | Loss: 0.3716 | LM: 0.3006 | LB: 1.2966 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.397/SR1: 0.366 | LR: 1.00e-05
281
+ [2026-04-07 18:27:51] Epoch 1 | Step 540 | Loss: 0.3713 | LM: 0.3004 | LB: 1.2953 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.396/SR1: 0.365 | LR: 1.00e-05
282
+ [2026-04-07 18:28:00] Epoch 1 | Step 550 | Loss: 0.3709 | LM: 0.3009 | LB: 1.2940 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.396/SR1: 0.365 | LR: 1.00e-05
283
+ [2026-04-07 18:28:09] Epoch 1 | Step 560 | Loss: 0.3711 | LM: 0.3016 | LB: 1.2935 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.364 | LR: 1.00e-05
284
+ [2026-04-07 18:28:18] Epoch 1 | Step 570 | Loss: 0.3713 | LM: 0.3021 | LB: 1.2928 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.364 | LR: 1.00e-05
285
+ [2026-04-07 18:28:27] Epoch 1 | Step 580 | Loss: 0.3708 | LM: 0.3019 | LB: 1.2920 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.363 | LR: 1.00e-05
286
+ [2026-04-07 18:28:36] Epoch 1 | Step 590 | Loss: 0.3713 | LM: 0.3033 | LB: 1.2910 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.394/SR1: 0.363 | LR: 1.00e-05
287
+ [2026-04-07 18:28:45] Epoch 1 | Step 600 | Loss: 0.3708 | LM: 0.3026 | LB: 1.2905 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.394/SR1: 0.362 | LR: 1.00e-05
288
+ [2026-04-07 18:28:54] Epoch 1 | Step 610 | Loss: 0.3704 | LM: 0.3023 | LB: 1.2900 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.393/SR1: 0.362 | LR: 1.00e-05
289
+ [2026-04-07 18:29:03] Epoch 1 | Step 620 | Loss: 0.3701 | LM: 0.3024 | LB: 1.2893 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.393/SR1: 0.361 | LR: 1.00e-05
290
+ [2026-04-07 18:29:12] Epoch 1 | Step 630 | Loss: 0.3697 | LM: 0.3022 | LB: 1.2884 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.393/SR1: 0.361 | LR: 1.00e-05
291
+ [2026-04-07 18:29:21] Epoch 1 | Step 640 | Loss: 0.3693 | LM: 0.3022 | LB: 1.2879 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.392/SR1: 0.360 | LR: 1.00e-05
292
+ [2026-04-07 18:29:30] Epoch 1 | Step 650 | Loss: 0.3695 | LM: 0.3018 | LB: 1.2872 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.392/SR1: 0.360 | LR: 1.00e-05
293
+ [2026-04-07 18:29:39] Epoch 1 | Step 660 | Loss: 0.3690 | LM: 0.3015 | LB: 1.2867 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.391/SR1: 0.359 | LR: 1.00e-05
294
+ [2026-04-07 18:29:48] Epoch 1 | Step 670 | Loss: 0.3697 | LM: 0.3019 | LB: 1.2864 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.391/SR1: 0.359 | LR: 1.00e-05
295
+ [2026-04-07 18:29:57] Epoch 1 | Step 680 | Loss: 0.3693 | LM: 0.3022 | LB: 1.2860 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.391/SR1: 0.359 | LR: 1.00e-05
296
+ [2026-04-07 18:30:06] Epoch 1 | Step 690 | Loss: 0.3694 | LM: 0.3027 | LB: 1.2858 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.391/SR1: 0.358 | LR: 1.00e-05
297
+ [2026-04-07 18:30:15] Epoch 1 | Step 700 | Loss: 0.3696 | LM: 0.3028 | LB: 1.2853 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.358 | LR: 1.00e-05
298
+ [2026-04-07 18:30:24] Epoch 1 | Step 710 | Loss: 0.3693 | LM: 0.3024 | LB: 1.2846 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.358 | LR: 1.00e-05
299
+ [2026-04-07 18:30:33] Epoch 1 | Step 720 | Loss: 0.3689 | LM: 0.3023 | LB: 1.2842 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.357 | LR: 1.00e-05
300
+ [2026-04-07 18:30:42] Epoch 1 | Step 730 | Loss: 0.3686 | LM: 0.3030 | LB: 1.2839 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.390/SR1: 0.357 | LR: 1.00e-05
301
+ [2026-04-07 18:30:51] Epoch 1 | Step 740 | Loss: 0.3684 | LM: 0.3032 | LB: 1.2836 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.389/SR1: 0.357 | LR: 1.00e-05
302
+ [2026-04-07 18:31:00] Epoch 1 | Step 750 | Loss: 0.3682 | LM: 0.3028 | LB: 1.2829 | CL0: 2.8 | CL1: 2.6 | HR0: 0.355/SR0: 0.350 | HR1: 0.389/SR1: 0.357 | LR: 1.00e-05
303
+ [2026-04-07 18:31:01] Validation | Batch 10/732 | Loss: 0.3367 | LM: 0.2749
304
+ [2026-04-07 18:31:02] Validation | Batch 20/732 | Loss: 0.3589 | LM: 0.2966
305
+ [2026-04-07 18:31:04] Validation | Batch 30/732 | Loss: 0.3502 | LM: 0.2882
306
+ [2026-04-07 18:31:05] Validation | Batch 40/732 | Loss: 0.3557 | LM: 0.2936
307
+ [2026-04-07 18:31:06] Validation | Batch 50/732 | Loss: 0.3557 | LM: 0.2936
308
+ [2026-04-07 18:31:07] Validation | Batch 60/732 | Loss: 0.3577 | LM: 0.2957
309
+ [2026-04-07 18:31:08] Validation | Batch 70/732 | Loss: 0.3612 | LM: 0.2991
310
+ [2026-04-07 18:31:09] Validation | Batch 80/732 | Loss: 0.3594 | LM: 0.2975
311
+ [2026-04-07 18:31:10] Validation | Batch 90/732 | Loss: 0.3592 | LM: 0.2973
312
+ [2026-04-07 18:31:12] Validation | Batch 100/732 | Loss: 0.3602 | LM: 0.2983
313
+ [2026-04-07 18:31:13] Validation | Batch 110/732 | Loss: 0.3569 | LM: 0.2950
314
+ [2026-04-07 18:31:14] Validation | Batch 120/732 | Loss: 0.3600 | LM: 0.2981
315
+ [2026-04-07 18:31:15] Validation | Batch 130/732 | Loss: 0.3612 | LM: 0.2993
316
+ [2026-04-07 18:31:16] Validation | Batch 140/732 | Loss: 0.3606 | LM: 0.2987
317
+ [2026-04-07 18:31:17] Validation | Batch 150/732 | Loss: 0.3600 | LM: 0.2980
318
+ [2026-04-07 18:31:18] Validation | Batch 160/732 | Loss: 0.3591 | LM: 0.2972
319
+ [2026-04-07 18:31:19] Validation | Batch 170/732 | Loss: 0.3596 | LM: 0.2977
320
+ [2026-04-07 18:31:20] Validation | Batch 180/732 | Loss: 0.3610 | LM: 0.2991
321
+ [2026-04-07 18:31:21] Validation | Batch 190/732 | Loss: 0.3602 | LM: 0.2983
322
+ [2026-04-07 18:31:22] Validation | Batch 200/732 | Loss: 0.3604 | LM: 0.2985
323
+ [2026-04-07 18:31:23] Validation | Batch 210/732 | Loss: 0.3597 | LM: 0.2977
324
+ [2026-04-07 18:31:24] Validation | Batch 220/732 | Loss: 0.3591 | LM: 0.2972
325
+ [2026-04-07 18:31:25] Validation | Batch 230/732 | Loss: 0.3596 | LM: 0.2976
326
+ [2026-04-07 18:31:27] Validation | Batch 240/732 | Loss: 0.3593 | LM: 0.2973
327
+ [2026-04-07 18:31:28] Validation | Batch 250/732 | Loss: 0.3593 | LM: 0.2973
328
+ [2026-04-07 18:31:29] Validation | Batch 260/732 | Loss: 0.3583 | LM: 0.2963
329
+ [2026-04-07 18:31:30] Validation | Batch 270/732 | Loss: 0.3581 | LM: 0.2961
330
+ [2026-04-07 18:31:31] Validation | Batch 280/732 | Loss: 0.3571 | LM: 0.2951
331
+ [2026-04-07 18:31:32] Validation | Batch 290/732 | Loss: 0.3569 | LM: 0.2949
332
+ [2026-04-07 18:31:33] Validation | Batch 300/732 | Loss: 0.3568 | LM: 0.2948
333
+ [2026-04-07 18:31:34] Validation | Batch 310/732 | Loss: 0.3567 | LM: 0.2946
334
+ [2026-04-07 18:31:35] Validation | Batch 320/732 | Loss: 0.3558 | LM: 0.2937
335
+ [2026-04-07 18:31:36] Validation | Batch 330/732 | Loss: 0.3547 | LM: 0.2926
336
+ [2026-04-07 18:31:37] Validation | Batch 340/732 | Loss: 0.3541 | LM: 0.2920
337
+ [2026-04-07 18:31:38] Validation | Batch 350/732 | Loss: 0.3544 | LM: 0.2924
338
+ [2026-04-07 18:31:39] Validation | Batch 360/732 | Loss: 0.3552 | LM: 0.2932
339
+ [2026-04-07 18:31:40] Validation | Batch 370/732 | Loss: 0.3543 | LM: 0.2923
340
+ [2026-04-07 18:31:41] Validation | Batch 380/732 | Loss: 0.3536 | LM: 0.2916
341
+ [2026-04-07 18:31:42] Validation | Batch 390/732 | Loss: 0.3533 | LM: 0.2913
342
+ [2026-04-07 18:31:43] Validation | Batch 400/732 | Loss: 0.3531 | LM: 0.2911
343
+ [2026-04-07 18:31:44] Validation | Batch 410/732 | Loss: 0.3524 | LM: 0.2904
344
+ [2026-04-07 18:31:45] Validation | Batch 420/732 | Loss: 0.3526 | LM: 0.2906
345
+ [2026-04-07 18:31:46] Validation | Batch 430/732 | Loss: 0.3526 | LM: 0.2905
346
+ [2026-04-07 18:31:47] Validation | Batch 440/732 | Loss: 0.3521 | LM: 0.2900
347
+ [2026-04-07 18:31:48] Validation | Batch 450/732 | Loss: 0.3519 | LM: 0.2898
348
+ [2026-04-07 18:31:50] Validation | Batch 460/732 | Loss: 0.3523 | LM: 0.2902
349
+ [2026-04-07 18:31:51] Validation | Batch 470/732 | Loss: 0.3520 | LM: 0.2900
350
+ [2026-04-07 18:31:52] Validation | Batch 480/732 | Loss: 0.3521 | LM: 0.2901
351
+ [2026-04-07 18:31:53] Validation | Batch 490/732 | Loss: 0.3532 | LM: 0.2912
352
+ [2026-04-07 18:31:54] Validation | Batch 500/732 | Loss: 0.3543 | LM: 0.2922
353
+ [2026-04-07 18:31:55] Validation | Batch 510/732 | Loss: 0.3539 | LM: 0.2919
354
+ [2026-04-07 18:31:56] Validation | Batch 520/732 | Loss: 0.3537 | LM: 0.2917
355
+ [2026-04-07 18:31:57] Validation | Batch 530/732 | Loss: 0.3532 | LM: 0.2911
356
+ [2026-04-07 18:31:58] Validation | Batch 540/732 | Loss: 0.3534 | LM: 0.2913
357
+ [2026-04-07 18:31:59] Validation | Batch 550/732 | Loss: 0.3533 | LM: 0.2912
358
+ [2026-04-07 18:32:00] Validation | Batch 560/732 | Loss: 0.3528 | LM: 0.2908
359
+ [2026-04-07 18:32:02] Validation | Batch 570/732 | Loss: 0.3529 | LM: 0.2908
360
+ [2026-04-07 18:32:03] Validation | Batch 580/732 | Loss: 0.3526 | LM: 0.2905
361
+ [2026-04-07 18:32:04] Validation | Batch 590/732 | Loss: 0.3526 | LM: 0.2905
362
+ [2026-04-07 18:32:05] Validation | Batch 600/732 | Loss: 0.3525 | LM: 0.2904
363
+ [2026-04-07 18:32:06] Validation | Batch 610/732 | Loss: 0.3530 | LM: 0.2910
364
+ [2026-04-07 18:32:07] Validation | Batch 620/732 | Loss: 0.3534 | LM: 0.2914
365
+ [2026-04-07 18:32:08] Validation | Batch 630/732 | Loss: 0.3532 | LM: 0.2911
366
+ [2026-04-07 18:32:10] Validation | Batch 640/732 | Loss: 0.3529 | LM: 0.2909
367
+ [2026-04-07 18:32:11] Validation | Batch 650/732 | Loss: 0.3528 | LM: 0.2907
368
+ [2026-04-07 18:32:12] Validation | Batch 660/732 | Loss: 0.3533 | LM: 0.2912
369
+ [2026-04-07 18:32:13] Validation | Batch 670/732 | Loss: 0.3539 | LM: 0.2919
370
+ [2026-04-07 18:32:14] Validation | Batch 680/732 | Loss: 0.3539 | LM: 0.2919
371
+ [2026-04-07 18:32:15] Validation | Batch 690/732 | Loss: 0.3540 | LM: 0.2920
372
+ [2026-04-07 18:32:16] Validation | Batch 700/732 | Loss: 0.3545 | LM: 0.2925
373
+ [2026-04-07 18:32:17] Validation | Batch 710/732 | Loss: 0.3549 | LM: 0.2929
374
+ [2026-04-07 18:32:18] Validation | Batch 720/732 | Loss: 0.3559 | LM: 0.2938
375
+ [2026-04-07 18:32:19] Validation | Batch 730/732 | Loss: 0.3555 | LM: 0.2935
376
+ [2026-04-07 18:32:19] Validation | Batch 732/732 | Loss: 0.3553 | LM: 0.2933
377
+ [2026-04-07 18:32:19] Validation | Loss: 0.3553 | LM: 0.2933 | PPL: 1.34 | Time: 78.83s
378
+ [2026-04-07 18:32:26] New best model saved! Val loss: 0.3553
379
+ [2026-04-07 18:32:35] Epoch 1 | Step 760 | Loss: 0.3679 | LM: 0.3019 | LB: 1.2825 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.389/SR1: 0.356 | LR: 1.00e-05
380
+ [2026-04-07 18:32:44] Epoch 1 | Step 770 | Loss: 0.3676 | LM: 0.3019 | LB: 1.2821 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.389/SR1: 0.356 | LR: 1.00e-05
381
+ [2026-04-07 18:32:53] Epoch 1 | Step 780 | Loss: 0.3679 | LM: 0.3034 | LB: 1.2814 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.389/SR1: 0.356 | LR: 1.00e-05
382
+ [2026-04-07 18:32:54] Reached max_steps=781, stopping training.
383
+ [2026-04-07 18:32:54] Epoch 1 completed in 981.07s | Loss: 0.3678 | CL0: 2.8 | CL1: 2.6
384
+ [2026-04-07 18:32:54]
385
+ Training completed!
386
+ [2026-04-07 18:32:57] Final model: outputs/N_6.0/model_final.pt
routing_tuning_test_07_04/N_8.0/.hydra/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 1
6
+ max_steps: null
7
+ batch_size: 8
8
+ eval_batch_size: 24
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ lr_multiplier:
22
+ - 2.0
23
+ - 1.5
24
+ - 1.0
25
+ load_balancing_weight: 0.05
26
+ load_balancing_N: 8.0
27
+ max_grad_norm: 1.0
28
+ use_amp: true
29
+ resume: false
30
+ resume_checkpoint: null
31
+ warmup_model: true
32
+ data:
33
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
34
+ max_context_len: 4096
35
+ max_target_len: 256
36
+ num_workers: 0
37
+ pin_memory: true
38
+ max_train_samples: 50000
39
+ max_val_samples: null
40
+ logging:
41
+ log_interval: 10
42
+ save_interval: 1000
43
+ eval_interval: 250
44
+ save_every_epoch: false
45
+ model_only_checkpoints: true
46
+ tracking:
47
+ enabled: true
48
+ project: routing-evolution
49
+ run_name: routing_N8.0
50
+ paths:
51
+ output_dir: outputs/N_${training.load_balancing_N}
52
+ seed: 42
53
+ device: cuda
routing_tuning_test_07_04/N_8.0/.hydra/hydra.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - training.load_balancing_N=8.0
116
+ - tracking.run_name=routing_N8.0
117
+ job:
118
+ name: train
119
+ chdir: false
120
+ override_dirname: tracking.run_name=routing_N8.0,training.load_balancing_N=8.0
121
+ id: ???
122
+ num: ???
123
+ config_name: config
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /workspace/byte-llms-code/routing_evolution_exp
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /workspace/byte-llms-code/routing_evolution_exp/configs
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/N_8.0
146
+ choices:
147
+ paths: default
148
+ tracking: default
149
+ logging: default
150
+ data: default
151
+ training: default
152
+ model: hnet_xl_code
153
+ hydra/env: default
154
+ hydra/callbacks: null
155
+ hydra/job_logging: default
156
+ hydra/hydra_logging: default
157
+ hydra/hydra_help: default
158
+ hydra/help: default
159
+ hydra/sweeper: basic
160
+ hydra/launcher: basic
161
+ hydra/output: default
162
+ verbose: false
routing_tuning_test_07_04/N_8.0/.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - training.load_balancing_N=8.0
2
+ - tracking.run_name=routing_N8.0
routing_tuning_test_07_04/N_8.0/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a904a1eb0767062c496d1c785af6f85234754836e77f7b8432defda88c98e79
3
+ size 3315165139
routing_tuning_test_07_04/N_8.0/model_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b37dfdfd5ba634b8cff33353e06b70fc2fcb0bb4997fd44473c4178515bcbf8
3
+ size 3315165484
routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:132bc38ff57f9db6c9ca06a66251bf38dc4231f85bd01101af2cf1df371b5db7
3
+ size 13633736
routing_tuning_test_07_04/N_8.0/routing_weights/routing_step_781.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2793d8348710284e5ac059f1dee64775b65061d5315030d4f0b21f9840fe72b
3
+ size 13633752
routing_tuning_test_07_04/N_8.0/train.log ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-04-07 18:33:15] CUDA_VISIBLE_DEVICES: 0,1
2
+ [2026-04-07 18:33:15] Number of processes: 2
3
+ [2026-04-07 18:33:15] Mixed precision: bf16
4
+ [2026-04-07 18:33:15] ============================================================
5
+ [2026-04-07 18:33:15] Routing Evolution Experiment | N=8.0
6
+ [2026-04-07 18:33:15] ============================================================
7
+ [2026-04-07 18:33:15] Config:
8
+ model:
9
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
10
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
11
+ training:
12
+ epochs: 1
13
+ max_steps: null
14
+ batch_size: 8
15
+ eval_batch_size: 24
16
+ gradient_accumulation_steps: 4
17
+ lr: 0.0001
18
+ weight_decay: 0.1
19
+ betas:
20
+ - 0.9
21
+ - 0.95
22
+ eps: 1.0e-08
23
+ lr_scheduler: wsd
24
+ warmup_ratio: 0.1
25
+ decay_ratio: 0.2
26
+ warmup_steps: 100
27
+ min_lr_ratio: 0.1
28
+ lr_multiplier:
29
+ - 2.0
30
+ - 1.5
31
+ - 1.0
32
+ load_balancing_weight: 0.05
33
+ load_balancing_N: 8.0
34
+ max_grad_norm: 1.0
35
+ use_amp: true
36
+ resume: false
37
+ resume_checkpoint: null
38
+ warmup_model: true
39
+ data:
40
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
41
+ max_context_len: 4096
42
+ max_target_len: 256
43
+ num_workers: 0
44
+ pin_memory: true
45
+ max_train_samples: 50000
46
+ max_val_samples: null
47
+ logging:
48
+ log_interval: 10
49
+ save_interval: 1000
50
+ eval_interval: 250
51
+ save_every_epoch: false
52
+ model_only_checkpoints: true
53
+ tracking:
54
+ enabled: true
55
+ project: routing-evolution
56
+ run_name: routing_N8.0
57
+ paths:
58
+ output_dir: outputs/N_8.0
59
+ seed: 42
60
+ device: cuda
61
+
62
+ [2026-04-07 18:33:15] Loading model...
63
+ [2026-04-07 18:33:21] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
64
+ [2026-04-07 18:33:21] Applied LR multipliers: [2.0, 1.5, 1.0]
65
+ [2026-04-07 18:33:21] Warming up model...
66
+ [2026-04-07 18:34:07] Total params: 1,654,090,112
67
+ [2026-04-07 18:34:07] Trainable params: 1,654,090,112
68
+ [2026-04-07 18:34:07] Creating dataloaders...
69
+ [2026-04-07 18:34:07] Train dataset size: 50000 (max_train_samples=50000) | Epochs: 1
70
+ [2026-04-07 18:34:07] Max steps: 781, Steps per epoch: 3125
71
+ [2026-04-07 18:34:09] Starting training...
72
+ [2026-04-07 18:34:09]
73
+ ============================================================
74
+ [2026-04-07 18:34:09] EPOCH 1/1 (step 0)
75
+ [2026-04-07 18:34:09] ============================================================
76
+ [2026-04-07 18:34:43] Epoch 1 | Step 10 | Loss: 0.7353 | LM: 0.6493 | LB: 1.8079 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.482/SR1: 0.460 | LR: 3.31e-05
77
+ [2026-04-07 18:34:52] Epoch 1 | Step 20 | Loss: 0.6347 | LM: 0.5508 | LB: 1.8018 | CL0: 2.8 | CL1: 2.1 | HR0: 0.364/SR0: 0.362 | HR1: 0.479/SR1: 0.458 | LR: 5.62e-05
78
+ [2026-04-07 18:35:02] Epoch 1 | Step 30 | Loss: 0.5759 | LM: 0.4785 | LB: 1.7936 | CL0: 2.8 | CL1: 2.1 | HR0: 0.363/SR0: 0.361 | HR1: 0.477/SR1: 0.456 | LR: 7.92e-05
79
+ [2026-04-07 18:35:12] Epoch 1 | Step 40 | Loss: 0.5394 | LM: 0.4475 | LB: 1.7712 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.357 | HR1: 0.473/SR1: 0.452 | LR: 1.00e-04
80
+ [2026-04-07 18:35:21] Epoch 1 | Step 50 | Loss: 0.5164 | LM: 0.4096 | LB: 1.7527 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.466/SR1: 0.446 | LR: 1.00e-04
81
+ [2026-04-07 18:35:30] Epoch 1 | Step 60 | Loss: 0.4896 | LM: 0.3812 | LB: 1.7451 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.464/SR1: 0.444 | LR: 1.00e-04
82
+ [2026-04-07 18:35:39] Epoch 1 | Step 70 | Loss: 0.4748 | LM: 0.3774 | LB: 1.7374 | CL0: 2.8 | CL1: 2.2 | HR0: 0.359/SR0: 0.357 | HR1: 0.461/SR1: 0.440 | LR: 1.00e-04
83
+ [2026-04-07 18:35:48] Epoch 1 | Step 80 | Loss: 0.4595 | LM: 0.3666 | LB: 1.7265 | CL0: 2.8 | CL1: 2.2 | HR0: 0.359/SR0: 0.358 | HR1: 0.457/SR1: 0.436 | LR: 1.00e-04
84
+ [2026-04-07 18:35:57] Epoch 1 | Step 90 | Loss: 0.4507 | LM: 0.3538 | LB: 1.7140 | CL0: 2.8 | CL1: 2.2 | HR0: 0.360/SR0: 0.358 | HR1: 0.452/SR1: 0.431 | LR: 1.00e-04
85
+ [2026-04-07 18:36:06] Epoch 1 | Step 100 | Loss: 0.4452 | LM: 0.3512 | LB: 1.7013 | CL0: 2.8 | CL1: 2.2 | HR0: 0.357/SR0: 0.356 | HR1: 0.450/SR1: 0.427 | LR: 1.00e-04
86
+ [2026-04-07 18:36:15] Epoch 1 | Step 110 | Loss: 0.4404 | LM: 0.3461 | LB: 1.6898 | CL0: 2.8 | CL1: 2.3 | HR0: 0.357/SR0: 0.355 | HR1: 0.446/SR1: 0.423 | LR: 1.00e-04
87
+ [2026-04-07 18:36:24] Epoch 1 | Step 120 | Loss: 0.4376 | LM: 0.3477 | LB: 1.6780 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.443/SR1: 0.419 | LR: 1.00e-04
88
+ [2026-04-07 18:36:34] Epoch 1 | Step 130 | Loss: 0.4332 | LM: 0.3445 | LB: 1.6688 | CL0: 2.8 | CL1: 2.3 | HR0: 0.356/SR0: 0.354 | HR1: 0.440/SR1: 0.416 | LR: 1.00e-04
89
+ [2026-04-07 18:36:43] Epoch 1 | Step 140 | Loss: 0.4286 | LM: 0.3403 | LB: 1.6577 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.436/SR1: 0.412 | LR: 1.00e-04
90
+ [2026-04-07 18:36:52] Epoch 1 | Step 150 | Loss: 0.4259 | LM: 0.3379 | LB: 1.6492 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.353 | HR1: 0.433/SR1: 0.409 | LR: 1.00e-04
91
+ [2026-04-07 18:37:02] Epoch 1 | Step 160 | Loss: 0.4213 | LM: 0.3326 | LB: 1.6406 | CL0: 2.8 | CL1: 2.3 | HR0: 0.355/SR0: 0.352 | HR1: 0.431/SR1: 0.406 | LR: 1.00e-04
92
+ [2026-04-07 18:37:11] Epoch 1 | Step 170 | Loss: 0.4162 | LM: 0.3280 | LB: 1.6327 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.428/SR1: 0.403 | LR: 1.00e-04
93
+ [2026-04-07 18:37:20] Epoch 1 | Step 180 | Loss: 0.4124 | LM: 0.3240 | LB: 1.6258 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.425/SR1: 0.400 | LR: 1.00e-04
94
+ [2026-04-07 18:37:29] Epoch 1 | Step 190 | Loss: 0.4105 | LM: 0.3239 | LB: 1.6189 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.353 | HR1: 0.423/SR1: 0.396 | LR: 1.00e-04
95
+ [2026-04-07 18:37:38] Epoch 1 | Step 200 | Loss: 0.4078 | LM: 0.3214 | LB: 1.6109 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.352 | HR1: 0.420/SR1: 0.393 | LR: 1.00e-04
96
+ [2026-04-07 18:37:47] Epoch 1 | Step 210 | Loss: 0.4074 | LM: 0.3221 | LB: 1.6032 | CL0: 2.8 | CL1: 2.4 | HR0: 0.354/SR0: 0.351 | HR1: 0.417/SR1: 0.391 | LR: 1.00e-04
97
+ [2026-04-07 18:37:56] Epoch 1 | Step 220 | Loss: 0.4063 | LM: 0.3201 | LB: 1.5976 | CL0: 2.8 | CL1: 2.4 | HR0: 0.354/SR0: 0.351 | HR1: 0.415/SR1: 0.388 | LR: 1.00e-04
98
+ [2026-04-07 18:38:05] Epoch 1 | Step 230 | Loss: 0.4048 | LM: 0.3194 | LB: 1.5915 | CL0: 2.8 | CL1: 2.4 | HR0: 0.355/SR0: 0.351 | HR1: 0.413/SR1: 0.385 | LR: 1.00e-04
99
+ [2026-04-07 18:38:14] Epoch 1 | Step 240 | Loss: 0.4025 | LM: 0.3205 | LB: 1.5857 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.351 | HR1: 0.411/SR1: 0.383 | LR: 1.00e-04
100
+ [2026-04-07 18:38:23] Epoch 1 | Step 250 | Loss: 0.4000 | LM: 0.3185 | LB: 1.5797 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.351 | HR1: 0.409/SR1: 0.380 | LR: 1.00e-04
101
+ [2026-04-07 18:38:24] Validation | Batch 10/732 | Loss: 0.3503 | LM: 0.2788
102
+ [2026-04-07 18:38:25] Validation | Batch 20/732 | Loss: 0.3730 | LM: 0.3007
103
+ [2026-04-07 18:38:27] Validation | Batch 30/732 | Loss: 0.3641 | LM: 0.2922
104
+ [2026-04-07 18:38:28] Validation | Batch 40/732 | Loss: 0.3696 | LM: 0.2978
105
+ [2026-04-07 18:38:29] Validation | Batch 50/732 | Loss: 0.3702 | LM: 0.2982
106
+ [2026-04-07 18:38:30] Validation | Batch 60/732 | Loss: 0.3728 | LM: 0.3010
107
+ [2026-04-07 18:38:31] Validation | Batch 70/732 | Loss: 0.3764 | LM: 0.3044
108
+ [2026-04-07 18:38:32] Validation | Batch 80/732 | Loss: 0.3746 | LM: 0.3029
109
+ [2026-04-07 18:38:33] Validation | Batch 90/732 | Loss: 0.3741 | LM: 0.3024
110
+ [2026-04-07 18:38:35] Validation | Batch 100/732 | Loss: 0.3751 | LM: 0.3033
111
+ [2026-04-07 18:38:36] Validation | Batch 110/732 | Loss: 0.3718 | LM: 0.3000
112
+ [2026-04-07 18:38:37] Validation | Batch 120/732 | Loss: 0.3750 | LM: 0.3032
113
+ [2026-04-07 18:38:38] Validation | Batch 130/732 | Loss: 0.3762 | LM: 0.3045
114
+ [2026-04-07 18:38:39] Validation | Batch 140/732 | Loss: 0.3757 | LM: 0.3039
115
+ [2026-04-07 18:38:40] Validation | Batch 150/732 | Loss: 0.3751 | LM: 0.3033
116
+ [2026-04-07 18:38:41] Validation | Batch 160/732 | Loss: 0.3740 | LM: 0.3023
117
+ [2026-04-07 18:38:42] Validation | Batch 170/732 | Loss: 0.3744 | LM: 0.3026
118
+ [2026-04-07 18:38:43] Validation | Batch 180/732 | Loss: 0.3757 | LM: 0.3040
119
+ [2026-04-07 18:38:44] Validation | Batch 190/732 | Loss: 0.3748 | LM: 0.3031
120
+ [2026-04-07 18:38:45] Validation | Batch 200/732 | Loss: 0.3749 | LM: 0.3032
121
+ [2026-04-07 18:38:46] Validation | Batch 210/732 | Loss: 0.3741 | LM: 0.3024
122
+ [2026-04-07 18:38:47] Validation | Batch 220/732 | Loss: 0.3736 | LM: 0.3018
123
+ [2026-04-07 18:38:48] Validation | Batch 230/732 | Loss: 0.3740 | LM: 0.3022
124
+ [2026-04-07 18:38:49] Validation | Batch 240/732 | Loss: 0.3736 | LM: 0.3019
125
+ [2026-04-07 18:38:51] Validation | Batch 250/732 | Loss: 0.3737 | LM: 0.3019
126
+ [2026-04-07 18:38:52] Validation | Batch 260/732 | Loss: 0.3727 | LM: 0.3009
127
+ [2026-04-07 18:38:53] Validation | Batch 270/732 | Loss: 0.3725 | LM: 0.3006
128
+ [2026-04-07 18:38:54] Validation | Batch 280/732 | Loss: 0.3714 | LM: 0.2996
129
+ [2026-04-07 18:38:55] Validation | Batch 290/732 | Loss: 0.3713 | LM: 0.2994
130
+ [2026-04-07 18:38:56] Validation | Batch 300/732 | Loss: 0.3712 | LM: 0.2993
131
+ [2026-04-07 18:38:57] Validation | Batch 310/732 | Loss: 0.3711 | LM: 0.2992
132
+ [2026-04-07 18:38:58] Validation | Batch 320/732 | Loss: 0.3701 | LM: 0.2982
133
+ [2026-04-07 18:38:59] Validation | Batch 330/732 | Loss: 0.3690 | LM: 0.2971
134
+ [2026-04-07 18:39:00] Validation | Batch 340/732 | Loss: 0.3684 | LM: 0.2965
135
+ [2026-04-07 18:39:01] Validation | Batch 350/732 | Loss: 0.3687 | LM: 0.2968
136
+ [2026-04-07 18:39:02] Validation | Batch 360/732 | Loss: 0.3696 | LM: 0.2977
137
+ [2026-04-07 18:39:03] Validation | Batch 370/732 | Loss: 0.3686 | LM: 0.2967
138
+ [2026-04-07 18:39:04] Validation | Batch 380/732 | Loss: 0.3679 | LM: 0.2960
139
+ [2026-04-07 18:39:05] Validation | Batch 390/732 | Loss: 0.3675 | LM: 0.2956
140
+ [2026-04-07 18:39:06] Validation | Batch 400/732 | Loss: 0.3674 | LM: 0.2955
141
+ [2026-04-07 18:39:07] Validation | Batch 410/732 | Loss: 0.3666 | LM: 0.2947
142
+ [2026-04-07 18:39:08] Validation | Batch 420/732 | Loss: 0.3668 | LM: 0.2949
143
+ [2026-04-07 18:39:09] Validation | Batch 430/732 | Loss: 0.3667 | LM: 0.2948
144
+ [2026-04-07 18:39:10] Validation | Batch 440/732 | Loss: 0.3662 | LM: 0.2943
145
+ [2026-04-07 18:39:11] Validation | Batch 450/732 | Loss: 0.3660 | LM: 0.2941
146
+ [2026-04-07 18:39:13] Validation | Batch 460/732 | Loss: 0.3664 | LM: 0.2945
147
+ [2026-04-07 18:39:14] Validation | Batch 470/732 | Loss: 0.3662 | LM: 0.2943
148
+ [2026-04-07 18:39:15] Validation | Batch 480/732 | Loss: 0.3663 | LM: 0.2944
149
+ [2026-04-07 18:39:16] Validation | Batch 490/732 | Loss: 0.3674 | LM: 0.2955
150
+ [2026-04-07 18:39:17] Validation | Batch 500/732 | Loss: 0.3685 | LM: 0.2966
151
+ [2026-04-07 18:39:18] Validation | Batch 510/732 | Loss: 0.3681 | LM: 0.2962
152
+ [2026-04-07 18:39:19] Validation | Batch 520/732 | Loss: 0.3679 | LM: 0.2960
153
+ [2026-04-07 18:39:20] Validation | Batch 530/732 | Loss: 0.3673 | LM: 0.2954
154
+ [2026-04-07 18:39:21] Validation | Batch 540/732 | Loss: 0.3675 | LM: 0.2956
155
+ [2026-04-07 18:39:22] Validation | Batch 550/732 | Loss: 0.3674 | LM: 0.2955
156
+ [2026-04-07 18:39:23] Validation | Batch 560/732 | Loss: 0.3670 | LM: 0.2951
157
+ [2026-04-07 18:39:24] Validation | Batch 570/732 | Loss: 0.3670 | LM: 0.2951
158
+ [2026-04-07 18:39:25] Validation | Batch 580/732 | Loss: 0.3668 | LM: 0.2949
159
+ [2026-04-07 18:39:27] Validation | Batch 590/732 | Loss: 0.3667 | LM: 0.2948
160
+ [2026-04-07 18:39:28] Validation | Batch 600/732 | Loss: 0.3667 | LM: 0.2948
161
+ [2026-04-07 18:39:29] Validation | Batch 610/732 | Loss: 0.3673 | LM: 0.2954
162
+ [2026-04-07 18:39:30] Validation | Batch 620/732 | Loss: 0.3677 | LM: 0.2958
163
+ [2026-04-07 18:39:31] Validation | Batch 630/732 | Loss: 0.3675 | LM: 0.2956
164
+ [2026-04-07 18:39:32] Validation | Batch 640/732 | Loss: 0.3673 | LM: 0.2954
165
+ [2026-04-07 18:39:33] Validation | Batch 650/732 | Loss: 0.3671 | LM: 0.2952
166
+ [2026-04-07 18:39:34] Validation | Batch 660/732 | Loss: 0.3675 | LM: 0.2957
167
+ [2026-04-07 18:39:35] Validation | Batch 670/732 | Loss: 0.3682 | LM: 0.2963
168
+ [2026-04-07 18:39:36] Validation | Batch 680/732 | Loss: 0.3681 | LM: 0.2963
169
+ [2026-04-07 18:39:37] Validation | Batch 690/732 | Loss: 0.3683 | LM: 0.2964
170
+ [2026-04-07 18:39:39] Validation | Batch 700/732 | Loss: 0.3688 | LM: 0.2969
171
+ [2026-04-07 18:39:40] Validation | Batch 710/732 | Loss: 0.3692 | LM: 0.2973
172
+ [2026-04-07 18:39:41] Validation | Batch 720/732 | Loss: 0.3701 | LM: 0.2983
173
+ [2026-04-07 18:39:42] Validation | Batch 730/732 | Loss: 0.3698 | LM: 0.2979
174
+ [2026-04-07 18:39:42] Validation | Batch 732/732 | Loss: 0.3696 | LM: 0.2977
175
+ [2026-04-07 18:39:42] Validation | Loss: 0.3696 | LM: 0.2977 | PPL: 1.35 | Time: 78.62s
176
+ [2026-04-07 18:39:45] New best model saved! Val loss: 0.3696
177
+ [2026-04-07 18:39:54] Epoch 1 | Step 260 | Loss: 0.3979 | LM: 0.3177 | LB: 1.5742 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.407/SR1: 0.378 | LR: 1.00e-04
178
+ [2026-04-07 18:40:03] Epoch 1 | Step 270 | Loss: 0.3979 | LM: 0.3162 | LB: 1.5694 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.405/SR1: 0.376 | LR: 1.00e-04
179
+ [2026-04-07 18:40:12] Epoch 1 | Step 280 | Loss: 0.3976 | LM: 0.3163 | LB: 1.5644 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.404/SR1: 0.374 | LR: 1.00e-04
180
+ [2026-04-07 18:40:21] Epoch 1 | Step 290 | Loss: 0.3958 | LM: 0.3144 | LB: 1.5597 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.402/SR1: 0.372 | LR: 1.00e-04
181
+ [2026-04-07 18:40:30] Epoch 1 | Step 300 | Loss: 0.3950 | LM: 0.3140 | LB: 1.5559 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.400/SR1: 0.370 | LR: 1.00e-04
182
+ [2026-04-07 18:40:40] Epoch 1 | Step 310 | Loss: 0.3940 | LM: 0.3131 | LB: 1.5524 | CL0: 2.8 | CL1: 2.5 | HR0: 0.354/SR0: 0.350 | HR1: 0.398/SR1: 0.368 | LR: 1.00e-04
183
+ [2026-04-07 18:40:49] Epoch 1 | Step 320 | Loss: 0.3932 | LM: 0.3125 | LB: 1.5481 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.397/SR1: 0.366 | LR: 9.80e-05
184
+ [2026-04-07 18:40:58] Epoch 1 | Step 330 | Loss: 0.3918 | LM: 0.3116 | LB: 1.5438 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.395/SR1: 0.364 | LR: 8.93e-05
185
+ [2026-04-07 18:41:07] Epoch 1 | Step 340 | Loss: 0.3909 | LM: 0.3109 | LB: 1.5397 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.394/SR1: 0.362 | LR: 7.51e-05
186
+ [2026-04-07 18:41:16] Epoch 1 | Step 350 | Loss: 0.3903 | LM: 0.3115 | LB: 1.5360 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.350 | HR1: 0.392/SR1: 0.360 | LR: 5.77e-05
187
+ [2026-04-07 18:41:25] Epoch 1 | Step 360 | Loss: 0.3895 | LM: 0.3131 | LB: 1.5325 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.391/SR1: 0.359 | LR: 3.99e-05
188
+ [2026-04-07 18:41:34] Epoch 1 | Step 370 | Loss: 0.3890 | LM: 0.3127 | LB: 1.5297 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.390/SR1: 0.357 | LR: 2.45e-05
189
+ [2026-04-07 18:41:43] Epoch 1 | Step 380 | Loss: 0.3879 | LM: 0.3108 | LB: 1.5263 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.389/SR1: 0.356 | LR: 1.40e-05
190
+ [2026-04-07 18:41:52] Epoch 1 | Step 390 | Loss: 0.3880 | LM: 0.3106 | LB: 1.5230 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.387/SR1: 0.354 | LR: 1.00e-05
191
+ [2026-04-07 18:42:01] Epoch 1 | Step 400 | Loss: 0.3877 | LM: 0.3114 | LB: 1.5197 | CL0: 2.8 | CL1: 2.6 | HR0: 0.354/SR0: 0.349 | HR1: 0.386/SR1: 0.353 | LR: 1.00e-05
192
+ [2026-04-07 18:42:10] Epoch 1 | Step 410 | Loss: 0.3875 | LM: 0.3113 | LB: 1.5166 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.349 | HR1: 0.385/SR1: 0.352 | LR: 1.00e-05
193
+ [2026-04-07 18:42:20] Epoch 1 | Step 420 | Loss: 0.3875 | LM: 0.3107 | LB: 1.5138 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.348 | HR1: 0.384/SR1: 0.351 | LR: 1.00e-05
194
+ [2026-04-07 18:42:29] Epoch 1 | Step 430 | Loss: 0.3868 | LM: 0.3091 | LB: 1.5112 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.348 | HR1: 0.383/SR1: 0.350 | LR: 1.00e-05
195
+ [2026-04-07 18:42:37] Epoch 1 | Step 440 | Loss: 0.3861 | LM: 0.3082 | LB: 1.5090 | CL0: 2.8 | CL1: 2.6 | HR0: 0.353/SR0: 0.348 | HR1: 0.383/SR1: 0.349 | LR: 1.00e-05
196
+ [2026-04-07 18:42:47] Epoch 1 | Step 450 | Loss: 0.3853 | LM: 0.3070 | LB: 1.5068 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.382/SR1: 0.347 | LR: 1.00e-05
197
+ [2026-04-07 18:42:56] Epoch 1 | Step 460 | Loss: 0.3853 | LM: 0.3061 | LB: 1.5044 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.381/SR1: 0.346 | LR: 1.00e-05
198
+ [2026-04-07 18:43:05] Epoch 1 | Step 470 | Loss: 0.3851 | LM: 0.3069 | LB: 1.5026 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.380/SR1: 0.345 | LR: 1.00e-05
199
+ [2026-04-07 18:43:14] Epoch 1 | Step 480 | Loss: 0.3847 | LM: 0.3062 | LB: 1.5003 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.379/SR1: 0.344 | LR: 1.00e-05
200
+ [2026-04-07 18:43:23] Epoch 1 | Step 490 | Loss: 0.3839 | LM: 0.3051 | LB: 1.4984 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.378/SR1: 0.343 | LR: 1.00e-05
201
+ [2026-04-07 18:43:32] Epoch 1 | Step 500 | Loss: 0.3841 | LM: 0.3056 | LB: 1.4964 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.378/SR1: 0.342 | LR: 1.00e-05
202
+ [2026-04-07 18:43:33] Validation | Batch 10/732 | Loss: 0.3485 | LM: 0.2792
203
+ [2026-04-07 18:43:34] Validation | Batch 20/732 | Loss: 0.3705 | LM: 0.3004
204
+ [2026-04-07 18:43:35] Validation | Batch 30/732 | Loss: 0.3618 | LM: 0.2921
205
+ [2026-04-07 18:43:36] Validation | Batch 40/732 | Loss: 0.3675 | LM: 0.2979
206
+ [2026-04-07 18:43:37] Validation | Batch 50/732 | Loss: 0.3673 | LM: 0.2977
207
+ [2026-04-07 18:43:38] Validation | Batch 60/732 | Loss: 0.3694 | LM: 0.2998
208
+ [2026-04-07 18:43:39] Validation | Batch 70/732 | Loss: 0.3729 | LM: 0.3032
209
+ [2026-04-07 18:43:41] Validation | Batch 80/732 | Loss: 0.3713 | LM: 0.3018
210
+ [2026-04-07 18:43:42] Validation | Batch 90/732 | Loss: 0.3710 | LM: 0.3015
211
+ [2026-04-07 18:43:43] Validation | Batch 100/732 | Loss: 0.3718 | LM: 0.3023
212
+ [2026-04-07 18:43:44] Validation | Batch 110/732 | Loss: 0.3685 | LM: 0.2989
213
+ [2026-04-07 18:43:45] Validation | Batch 120/732 | Loss: 0.3715 | LM: 0.3020
214
+ [2026-04-07 18:43:46] Validation | Batch 130/732 | Loss: 0.3728 | LM: 0.3033
215
+ [2026-04-07 18:43:47] Validation | Batch 140/732 | Loss: 0.3722 | LM: 0.3027
216
+ [2026-04-07 18:43:48] Validation | Batch 150/732 | Loss: 0.3717 | LM: 0.3021
217
+ [2026-04-07 18:43:49] Validation | Batch 160/732 | Loss: 0.3707 | LM: 0.3011
218
+ [2026-04-07 18:43:50] Validation | Batch 170/732 | Loss: 0.3711 | LM: 0.3016
219
+ [2026-04-07 18:43:51] Validation | Batch 180/732 | Loss: 0.3726 | LM: 0.3030
220
+ [2026-04-07 18:43:52] Validation | Batch 190/732 | Loss: 0.3717 | LM: 0.3021
221
+ [2026-04-07 18:43:53] Validation | Batch 200/732 | Loss: 0.3718 | LM: 0.3022
222
+ [2026-04-07 18:43:54] Validation | Batch 210/732 | Loss: 0.3711 | LM: 0.3016
223
+ [2026-04-07 18:43:55] Validation | Batch 220/732 | Loss: 0.3706 | LM: 0.3010
224
+ [2026-04-07 18:43:56] Validation | Batch 230/732 | Loss: 0.3710 | LM: 0.3015
225
+ [2026-04-07 18:43:58] Validation | Batch 240/732 | Loss: 0.3707 | LM: 0.3012
226
+ [2026-04-07 18:43:59] Validation | Batch 250/732 | Loss: 0.3707 | LM: 0.3011
227
+ [2026-04-07 18:44:00] Validation | Batch 260/732 | Loss: 0.3697 | LM: 0.3001
228
+ [2026-04-07 18:44:01] Validation | Batch 270/732 | Loss: 0.3695 | LM: 0.2999
229
+ [2026-04-07 18:44:02] Validation | Batch 280/732 | Loss: 0.3685 | LM: 0.2989
230
+ [2026-04-07 18:44:03] Validation | Batch 290/732 | Loss: 0.3683 | LM: 0.2987
231
+ [2026-04-07 18:44:04] Validation | Batch 300/732 | Loss: 0.3683 | LM: 0.2986
232
+ [2026-04-07 18:44:05] Validation | Batch 310/732 | Loss: 0.3681 | LM: 0.2984
233
+ [2026-04-07 18:44:06] Validation | Batch 320/732 | Loss: 0.3672 | LM: 0.2975
234
+ [2026-04-07 18:44:07] Validation | Batch 330/732 | Loss: 0.3660 | LM: 0.2964
235
+ [2026-04-07 18:44:08] Validation | Batch 340/732 | Loss: 0.3655 | LM: 0.2958
236
+ [2026-04-07 18:44:09] Validation | Batch 350/732 | Loss: 0.3658 | LM: 0.2961
237
+ [2026-04-07 18:44:10] Validation | Batch 360/732 | Loss: 0.3667 | LM: 0.2970
238
+ [2026-04-07 18:44:11] Validation | Batch 370/732 | Loss: 0.3657 | LM: 0.2960
239
+ [2026-04-07 18:44:12] Validation | Batch 380/732 | Loss: 0.3651 | LM: 0.2954
240
+ [2026-04-07 18:44:13] Validation | Batch 390/732 | Loss: 0.3647 | LM: 0.2950
241
+ [2026-04-07 18:44:14] Validation | Batch 400/732 | Loss: 0.3645 | LM: 0.2949
242
+ [2026-04-07 18:44:15] Validation | Batch 410/732 | Loss: 0.3638 | LM: 0.2941
243
+ [2026-04-07 18:44:16] Validation | Batch 420/732 | Loss: 0.3640 | LM: 0.2944
244
+ [2026-04-07 18:44:17] Validation | Batch 430/732 | Loss: 0.3639 | LM: 0.2943
245
+ [2026-04-07 18:44:18] Validation | Batch 440/732 | Loss: 0.3635 | LM: 0.2938
246
+ [2026-04-07 18:44:19] Validation | Batch 450/732 | Loss: 0.3633 | LM: 0.2936
247
+ [2026-04-07 18:44:21] Validation | Batch 460/732 | Loss: 0.3637 | LM: 0.2940
248
+ [2026-04-07 18:44:22] Validation | Batch 470/732 | Loss: 0.3634 | LM: 0.2938
249
+ [2026-04-07 18:44:23] Validation | Batch 480/732 | Loss: 0.3635 | LM: 0.2938
250
+ [2026-04-07 18:44:24] Validation | Batch 490/732 | Loss: 0.3646 | LM: 0.2949
251
+ [2026-04-07 18:44:25] Validation | Batch 500/732 | Loss: 0.3656 | LM: 0.2960
252
+ [2026-04-07 18:44:26] Validation | Batch 510/732 | Loss: 0.3653 | LM: 0.2956
253
+ [2026-04-07 18:44:27] Validation | Batch 520/732 | Loss: 0.3651 | LM: 0.2955
254
+ [2026-04-07 18:44:28] Validation | Batch 530/732 | Loss: 0.3646 | LM: 0.2949
255
+ [2026-04-07 18:44:29] Validation | Batch 540/732 | Loss: 0.3647 | LM: 0.2951
256
+ [2026-04-07 18:44:30] Validation | Batch 550/732 | Loss: 0.3646 | LM: 0.2950
257
+ [2026-04-07 18:44:31] Validation | Batch 560/732 | Loss: 0.3642 | LM: 0.2945
258
+ [2026-04-07 18:44:32] Validation | Batch 570/732 | Loss: 0.3642 | LM: 0.2945
259
+ [2026-04-07 18:44:33] Validation | Batch 580/732 | Loss: 0.3639 | LM: 0.2943
260
+ [2026-04-07 18:44:35] Validation | Batch 590/732 | Loss: 0.3639 | LM: 0.2943
261
+ [2026-04-07 18:44:36] Validation | Batch 600/732 | Loss: 0.3638 | LM: 0.2942
262
+ [2026-04-07 18:44:37] Validation | Batch 610/732 | Loss: 0.3644 | LM: 0.2947
263
+ [2026-04-07 18:44:38] Validation | Batch 620/732 | Loss: 0.3648 | LM: 0.2951
264
+ [2026-04-07 18:44:39] Validation | Batch 630/732 | Loss: 0.3646 | LM: 0.2949
265
+ [2026-04-07 18:44:40] Validation | Batch 640/732 | Loss: 0.3643 | LM: 0.2946
266
+ [2026-04-07 18:44:41] Validation | Batch 650/732 | Loss: 0.3641 | LM: 0.2945
267
+ [2026-04-07 18:44:42] Validation | Batch 660/732 | Loss: 0.3646 | LM: 0.2950
268
+ [2026-04-07 18:44:43] Validation | Batch 670/732 | Loss: 0.3652 | LM: 0.2956
269
+ [2026-04-07 18:44:44] Validation | Batch 680/732 | Loss: 0.3652 | LM: 0.2956
270
+ [2026-04-07 18:44:45] Validation | Batch 690/732 | Loss: 0.3654 | LM: 0.2957
271
+ [2026-04-07 18:44:46] Validation | Batch 700/732 | Loss: 0.3658 | LM: 0.2962
272
+ [2026-04-07 18:44:47] Validation | Batch 710/732 | Loss: 0.3662 | LM: 0.2966
273
+ [2026-04-07 18:44:48] Validation | Batch 720/732 | Loss: 0.3672 | LM: 0.2975
274
+ [2026-04-07 18:44:49] Validation | Batch 730/732 | Loss: 0.3668 | LM: 0.2972
275
+ [2026-04-07 18:44:50] Validation | Batch 732/732 | Loss: 0.3667 | LM: 0.2970
276
+ [2026-04-07 18:44:50] Validation | Loss: 0.3667 | LM: 0.2970 | PPL: 1.35 | Time: 77.66s
277
+ [2026-04-07 18:44:56] New best model saved! Val loss: 0.3667
278
+ [2026-04-07 18:45:05] Epoch 1 | Step 510 | Loss: 0.3840 | LM: 0.3039 | LB: 1.4950 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.348 | HR1: 0.377/SR1: 0.342 | LR: 1.00e-05
279
+ [2026-04-07 18:45:14] Epoch 1 | Step 520 | Loss: 0.3836 | LM: 0.3030 | LB: 1.4927 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.376/SR1: 0.341 | LR: 1.00e-05
280
+ [2026-04-07 18:45:23] Epoch 1 | Step 530 | Loss: 0.3833 | LM: 0.3022 | LB: 1.4910 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.376/SR1: 0.340 | LR: 1.00e-05
281
+ [2026-04-07 18:45:32] Epoch 1 | Step 540 | Loss: 0.3829 | LM: 0.3020 | LB: 1.4888 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.375/SR1: 0.339 | LR: 1.00e-05
282
+ [2026-04-07 18:45:41] Epoch 1 | Step 550 | Loss: 0.3825 | LM: 0.3024 | LB: 1.4866 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.374/SR1: 0.338 | LR: 1.00e-05
283
+ [2026-04-07 18:45:50] Epoch 1 | Step 560 | Loss: 0.3827 | LM: 0.3032 | LB: 1.4855 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.374/SR1: 0.338 | LR: 1.00e-05
284
+ [2026-04-07 18:46:00] Epoch 1 | Step 570 | Loss: 0.3829 | LM: 0.3037 | LB: 1.4842 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.374/SR1: 0.337 | LR: 1.00e-05
285
+ [2026-04-07 18:46:09] Epoch 1 | Step 580 | Loss: 0.3824 | LM: 0.3037 | LB: 1.4828 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.373/SR1: 0.336 | LR: 1.00e-05
286
+ [2026-04-07 18:46:18] Epoch 1 | Step 590 | Loss: 0.3829 | LM: 0.3051 | LB: 1.4811 | CL0: 2.9 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.372/SR1: 0.336 | LR: 1.00e-05
287
+ [2026-04-07 18:46:27] Epoch 1 | Step 600 | Loss: 0.3825 | LM: 0.3044 | LB: 1.4800 | CL0: 2.9 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.372/SR1: 0.335 | LR: 1.00e-05
288
+ [2026-04-07 18:46:36] Epoch 1 | Step 610 | Loss: 0.3820 | LM: 0.3041 | LB: 1.4791 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.371/SR1: 0.335 | LR: 1.00e-05
289
+ [2026-04-07 18:46:45] Epoch 1 | Step 620 | Loss: 0.3817 | LM: 0.3042 | LB: 1.4778 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.371/SR1: 0.334 | LR: 1.00e-05
290
+ [2026-04-07 18:46:54] Epoch 1 | Step 630 | Loss: 0.3813 | LM: 0.3040 | LB: 1.4763 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.370/SR1: 0.333 | LR: 1.00e-05
291
+ [2026-04-07 18:47:03] Epoch 1 | Step 640 | Loss: 0.3809 | LM: 0.3041 | LB: 1.4753 | CL0: 2.8 | CL1: 2.7 | HR0: 0.352/SR0: 0.347 | HR1: 0.370/SR1: 0.333 | LR: 1.00e-05
292
+ [2026-04-07 18:47:12] Epoch 1 | Step 650 | Loss: 0.3811 | LM: 0.3037 | LB: 1.4741 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.369/SR1: 0.332 | LR: 1.00e-05
293
+ [2026-04-07 18:47:21] Epoch 1 | Step 660 | Loss: 0.3806 | LM: 0.3034 | LB: 1.4732 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.369/SR1: 0.332 | LR: 1.00e-05
294
+ [2026-04-07 18:47:31] Epoch 1 | Step 670 | Loss: 0.3813 | LM: 0.3039 | LB: 1.4725 | CL0: 2.8 | CL1: 2.7 | HR0: 0.353/SR0: 0.347 | HR1: 0.369/SR1: 0.331 | LR: 1.00e-05
295
+ [2026-04-07 18:47:40] Epoch 1 | Step 680 | Loss: 0.3810 | LM: 0.3042 | LB: 1.4717 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.368/SR1: 0.331 | LR: 1.00e-05
296
+ [2026-04-07 18:47:49] Epoch 1 | Step 690 | Loss: 0.3810 | LM: 0.3047 | LB: 1.4712 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.368/SR1: 0.330 | LR: 1.00e-05
297
+ [2026-04-07 18:47:58] Epoch 1 | Step 700 | Loss: 0.3811 | LM: 0.3048 | LB: 1.4703 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.368/SR1: 0.330 | LR: 1.00e-05
298
+ [2026-04-07 18:48:07] Epoch 1 | Step 710 | Loss: 0.3808 | LM: 0.3044 | LB: 1.4692 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.367/SR1: 0.329 | LR: 1.00e-05
299
+ [2026-04-07 18:48:16] Epoch 1 | Step 720 | Loss: 0.3805 | LM: 0.3043 | LB: 1.4684 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.367/SR1: 0.329 | LR: 1.00e-05
300
+ [2026-04-07 18:48:25] Epoch 1 | Step 730 | Loss: 0.3801 | LM: 0.3050 | LB: 1.4677 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.329 | LR: 1.00e-05
301
+ [2026-04-07 18:48:34] Epoch 1 | Step 740 | Loss: 0.3799 | LM: 0.3052 | LB: 1.4670 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.328 | LR: 1.00e-05
302
+ [2026-04-07 18:48:43] Epoch 1 | Step 750 | Loss: 0.3796 | LM: 0.3048 | LB: 1.4658 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.328 | LR: 1.00e-05
303
+ [2026-04-07 18:48:44] Validation | Batch 10/732 | Loss: 0.3485 | LM: 0.2793
304
+ [2026-04-07 18:48:45] Validation | Batch 20/732 | Loss: 0.3702 | LM: 0.3003
305
+ [2026-04-07 18:48:46] Validation | Batch 30/732 | Loss: 0.3614 | LM: 0.2918
306
+ [2026-04-07 18:48:47] Validation | Batch 40/732 | Loss: 0.3671 | LM: 0.2975
307
+ [2026-04-07 18:48:48] Validation | Batch 50/732 | Loss: 0.3669 | LM: 0.2973
308
+ [2026-04-07 18:48:49] Validation | Batch 60/732 | Loss: 0.3689 | LM: 0.2994
309
+ [2026-04-07 18:48:51] Validation | Batch 70/732 | Loss: 0.3724 | LM: 0.3028
310
+ [2026-04-07 18:48:52] Validation | Batch 80/732 | Loss: 0.3708 | LM: 0.3014
311
+ [2026-04-07 18:48:53] Validation | Batch 90/732 | Loss: 0.3705 | LM: 0.3011
312
+ [2026-04-07 18:48:54] Validation | Batch 100/732 | Loss: 0.3714 | LM: 0.3019
313
+ [2026-04-07 18:48:55] Validation | Batch 110/732 | Loss: 0.3681 | LM: 0.2986
314
+ [2026-04-07 18:48:56] Validation | Batch 120/732 | Loss: 0.3711 | LM: 0.3016
315
+ [2026-04-07 18:48:57] Validation | Batch 130/732 | Loss: 0.3723 | LM: 0.3029
316
+ [2026-04-07 18:48:58] Validation | Batch 140/732 | Loss: 0.3718 | LM: 0.3024
317
+ [2026-04-07 18:48:59] Validation | Batch 150/732 | Loss: 0.3713 | LM: 0.3017
318
+ [2026-04-07 18:49:00] Validation | Batch 160/732 | Loss: 0.3702 | LM: 0.3007
319
+ [2026-04-07 18:49:01] Validation | Batch 170/732 | Loss: 0.3706 | LM: 0.3012
320
+ [2026-04-07 18:49:02] Validation | Batch 180/732 | Loss: 0.3721 | LM: 0.3026
321
+ [2026-04-07 18:49:03] Validation | Batch 190/732 | Loss: 0.3712 | LM: 0.3017
322
+ [2026-04-07 18:49:04] Validation | Batch 200/732 | Loss: 0.3713 | LM: 0.3018
323
+ [2026-04-07 18:49:05] Validation | Batch 210/732 | Loss: 0.3706 | LM: 0.3011
324
+ [2026-04-07 18:49:06] Validation | Batch 220/732 | Loss: 0.3701 | LM: 0.3006
325
+ [2026-04-07 18:49:08] Validation | Batch 230/732 | Loss: 0.3705 | LM: 0.3010
326
+ [2026-04-07 18:49:09] Validation | Batch 240/732 | Loss: 0.3702 | LM: 0.3007
327
+ [2026-04-07 18:49:10] Validation | Batch 250/732 | Loss: 0.3702 | LM: 0.3007
328
+ [2026-04-07 18:49:11] Validation | Batch 260/732 | Loss: 0.3692 | LM: 0.2997
329
+ [2026-04-07 18:49:12] Validation | Batch 270/732 | Loss: 0.3690 | LM: 0.2995
330
+ [2026-04-07 18:49:13] Validation | Batch 280/732 | Loss: 0.3680 | LM: 0.2985
331
+ [2026-04-07 18:49:14] Validation | Batch 290/732 | Loss: 0.3678 | LM: 0.2983
332
+ [2026-04-07 18:49:15] Validation | Batch 300/732 | Loss: 0.3678 | LM: 0.2982
333
+ [2026-04-07 18:49:16] Validation | Batch 310/732 | Loss: 0.3676 | LM: 0.2980
334
+ [2026-04-07 18:49:17] Validation | Batch 320/732 | Loss: 0.3667 | LM: 0.2971
335
+ [2026-04-07 18:49:18] Validation | Batch 330/732 | Loss: 0.3656 | LM: 0.2960
336
+ [2026-04-07 18:49:19] Validation | Batch 340/732 | Loss: 0.3650 | LM: 0.2954
337
+ [2026-04-07 18:49:20] Validation | Batch 350/732 | Loss: 0.3653 | LM: 0.2957
338
+ [2026-04-07 18:49:21] Validation | Batch 360/732 | Loss: 0.3662 | LM: 0.2966
339
+ [2026-04-07 18:49:22] Validation | Batch 370/732 | Loss: 0.3652 | LM: 0.2956
340
+ [2026-04-07 18:49:23] Validation | Batch 380/732 | Loss: 0.3646 | LM: 0.2950
341
+ [2026-04-07 18:49:24] Validation | Batch 390/732 | Loss: 0.3642 | LM: 0.2946
342
+ [2026-04-07 18:49:25] Validation | Batch 400/732 | Loss: 0.3641 | LM: 0.2945
343
+ [2026-04-07 18:49:26] Validation | Batch 410/732 | Loss: 0.3633 | LM: 0.2937
344
+ [2026-04-07 18:49:27] Validation | Batch 420/732 | Loss: 0.3635 | LM: 0.2940
345
+ [2026-04-07 18:49:28] Validation | Batch 430/732 | Loss: 0.3635 | LM: 0.2939
346
+ [2026-04-07 18:49:29] Validation | Batch 440/732 | Loss: 0.3630 | LM: 0.2934
347
+ [2026-04-07 18:49:30] Validation | Batch 450/732 | Loss: 0.3628 | LM: 0.2932
348
+ [2026-04-07 18:49:31] Validation | Batch 460/732 | Loss: 0.3632 | LM: 0.2936
349
+ [2026-04-07 18:49:33] Validation | Batch 470/732 | Loss: 0.3630 | LM: 0.2934
350
+ [2026-04-07 18:49:33] Validation | Batch 480/732 | Loss: 0.3630 | LM: 0.2935
351
+ [2026-04-07 18:49:35] Validation | Batch 490/732 | Loss: 0.3641 | LM: 0.2945
352
+ [2026-04-07 18:49:36] Validation | Batch 500/732 | Loss: 0.3652 | LM: 0.2956
353
+ [2026-04-07 18:49:37] Validation | Batch 510/732 | Loss: 0.3648 | LM: 0.2953
354
+ [2026-04-07 18:49:38] Validation | Batch 520/732 | Loss: 0.3647 | LM: 0.2951
355
+ [2026-04-07 18:49:39] Validation | Batch 530/732 | Loss: 0.3641 | LM: 0.2945
356
+ [2026-04-07 18:49:40] Validation | Batch 540/732 | Loss: 0.3643 | LM: 0.2947
357
+ [2026-04-07 18:49:41] Validation | Batch 550/732 | Loss: 0.3642 | LM: 0.2946
358
+ [2026-04-07 18:49:42] Validation | Batch 560/732 | Loss: 0.3637 | LM: 0.2941
359
+ [2026-04-07 18:49:43] Validation | Batch 570/732 | Loss: 0.3638 | LM: 0.2941
360
+ [2026-04-07 18:49:44] Validation | Batch 580/732 | Loss: 0.3635 | LM: 0.2939
361
+ [2026-04-07 18:49:45] Validation | Batch 590/732 | Loss: 0.3635 | LM: 0.2939
362
+ [2026-04-07 18:49:46] Validation | Batch 600/732 | Loss: 0.3634 | LM: 0.2938
363
+ [2026-04-07 18:49:47] Validation | Batch 610/732 | Loss: 0.3639 | LM: 0.2943
364
+ [2026-04-07 18:49:49] Validation | Batch 620/732 | Loss: 0.3643 | LM: 0.2947
365
+ [2026-04-07 18:49:50] Validation | Batch 630/732 | Loss: 0.3641 | LM: 0.2945
366
+ [2026-04-07 18:49:51] Validation | Batch 640/732 | Loss: 0.3638 | LM: 0.2942
367
+ [2026-04-07 18:49:52] Validation | Batch 650/732 | Loss: 0.3637 | LM: 0.2941
368
+ [2026-04-07 18:49:53] Validation | Batch 660/732 | Loss: 0.3642 | LM: 0.2946
369
+ [2026-04-07 18:49:54] Validation | Batch 670/732 | Loss: 0.3648 | LM: 0.2952
370
+ [2026-04-07 18:49:55] Validation | Batch 680/732 | Loss: 0.3648 | LM: 0.2952
371
+ [2026-04-07 18:49:56] Validation | Batch 690/732 | Loss: 0.3649 | LM: 0.2953
372
+ [2026-04-07 18:49:57] Validation | Batch 700/732 | Loss: 0.3654 | LM: 0.2958
373
+ [2026-04-07 18:49:58] Validation | Batch 710/732 | Loss: 0.3658 | LM: 0.2962
374
+ [2026-04-07 18:49:59] Validation | Batch 720/732 | Loss: 0.3667 | LM: 0.2971
375
+ [2026-04-07 18:50:00] Validation | Batch 730/732 | Loss: 0.3664 | LM: 0.2968
376
+ [2026-04-07 18:50:00] Validation | Batch 732/732 | Loss: 0.3662 | LM: 0.2966
377
+ [2026-04-07 18:50:00] Validation | Loss: 0.3662 | LM: 0.2966 | PPL: 1.35 | Time: 77.29s
378
+ [2026-04-07 18:50:07] New best model saved! Val loss: 0.3662
379
+ [2026-04-07 18:50:16] Epoch 1 | Step 760 | Loss: 0.3793 | LM: 0.3040 | LB: 1.4650 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.347 | HR1: 0.366/SR1: 0.328 | LR: 1.00e-05
380
+ [2026-04-07 18:50:25] Epoch 1 | Step 770 | Loss: 0.3791 | LM: 0.3039 | LB: 1.4642 | CL0: 2.8 | CL1: 2.8 | HR0: 0.353/SR0: 0.346 | HR1: 0.365/SR1: 0.327 | LR: 1.00e-05
381
+ [2026-04-07 18:50:34] Epoch 1 | Step 780 | Loss: 0.3793 | LM: 0.3054 | LB: 1.4630 | CL0: 2.9 | CL1: 2.8 | HR0: 0.352/SR0: 0.346 | HR1: 0.365/SR1: 0.327 | LR: 1.00e-05
382
+ [2026-04-07 18:50:35] Reached max_steps=781, stopping training.
383
+ [2026-04-07 18:50:35] Epoch 1 completed in 986.44s | Loss: 0.3792 | CL0: 2.9 | CL1: 2.8
384
+ [2026-04-07 18:50:35]
385
+ Training completed!
386
+ [2026-04-07 18:50:38] Final model: outputs/N_8.0/model_final.pt