DannyAI committed on
Commit
7bde204
·
verified ·
1 Parent(s): 486307e

Training in progress, step 120

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "microsoft/Phi-4-mini-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "o_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "q_proj"
36
+ ],
37
+ "target_parameters": [],
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba15576476b741f5d2c780b3510a6d84c2431ba91617313d892077caf538da5
3
+ size 6299896
added_tokens.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|/tool_call|>": 200026,
3
+ "<|/tool|>": 200024,
4
+ "<|assistant|>": 200019,
5
+ "<|end|>": 200020,
6
+ "<|system|>": 200022,
7
+ "<|tag|>": 200028,
8
+ "<|tool_call|>": 200025,
9
+ "<|tool_response|>": 200027,
10
+ "<|tool|>": 200023,
11
+ "<|user|>": 200021
12
+ }
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Phi3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_phi3.Phi3Config",
9
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
10
+ "AutoTokenizer": "Xenova/gpt-4o"
11
+ },
12
+ "bos_token_id": 199999,
13
+ "dtype": "bfloat16",
14
+ "embd_pdrop": 0.0,
15
+ "eos_token_id": 199999,
16
+ "full_attn_mod": 1,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 3072,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 8192,
21
+ "interpolate_factor": 1,
22
+ "lm_head_bias": false,
23
+ "max_position_embeddings": 131072,
24
+ "mlp_bias": false,
25
+ "model_type": "phi3",
26
+ "num_attention_heads": 24,
27
+ "num_hidden_layers": 32,
28
+ "num_key_value_heads": 8,
29
+ "original_max_position_embeddings": 4096,
30
+ "pad_token_id": 199999,
31
+ "partial_rotary_factor": 0.75,
32
+ "resid_pdrop": 0.0,
33
+ "rms_norm_eps": 1e-05,
34
+ "rope_scaling": {
35
+ "long_factor": [
36
+ 1,
37
+ 1.118320672,
38
+ 1.250641126,
39
+ 1.398617824,
40
+ 1.564103225,
41
+ 1.74916897,
42
+ 1.956131817,
43
+ 2.187582649,
44
+ 2.446418898,
45
+ 2.735880826,
46
+ 3.059592084,
47
+ 3.421605075,
48
+ 3.826451687,
49
+ 4.279200023,
50
+ 4.785517845,
51
+ 5.351743533,
52
+ 5.984965424,
53
+ 6.693110555,
54
+ 7.485043894,
55
+ 8.370679318,
56
+ 9.36110372,
57
+ 10.4687158,
58
+ 11.70738129,
59
+ 13.09260651,
60
+ 14.64173252,
61
+ 16.37415215,
62
+ 18.31155283,
63
+ 20.47818807,
64
+ 22.90118105,
65
+ 25.61086418,
66
+ 28.64115884,
67
+ 32.03,
68
+ 32.1,
69
+ 32.13,
70
+ 32.23,
71
+ 32.6,
72
+ 32.61,
73
+ 32.64,
74
+ 32.66,
75
+ 32.7,
76
+ 32.71,
77
+ 32.93,
78
+ 32.97,
79
+ 33.28,
80
+ 33.49,
81
+ 33.5,
82
+ 44.16,
83
+ 47.77
84
+ ],
85
+ "short_factor": [
86
+ 1.0,
87
+ 1.0,
88
+ 1.0,
89
+ 1.0,
90
+ 1.0,
91
+ 1.0,
92
+ 1.0,
93
+ 1.0,
94
+ 1.0,
95
+ 1.0,
96
+ 1.0,
97
+ 1.0,
98
+ 1.0,
99
+ 1.0,
100
+ 1.0,
101
+ 1.0,
102
+ 1.0,
103
+ 1.0,
104
+ 1.0,
105
+ 1.0,
106
+ 1.0,
107
+ 1.0,
108
+ 1.0,
109
+ 1.0,
110
+ 1.0,
111
+ 1.0,
112
+ 1.0,
113
+ 1.0,
114
+ 1.0,
115
+ 1.0,
116
+ 1.0,
117
+ 1.0,
118
+ 1.0,
119
+ 1.0,
120
+ 1.0,
121
+ 1.0,
122
+ 1.0,
123
+ 1.0,
124
+ 1.0,
125
+ 1.0,
126
+ 1.0,
127
+ 1.0,
128
+ 1.0,
129
+ 1.0,
130
+ 1.0,
131
+ 1.0,
132
+ 1.0,
133
+ 1.0
134
+ ],
135
+ "type": "longrope"
136
+ },
137
+ "rope_theta": 10000.0,
138
+ "sliding_window": 262144,
139
+ "tie_word_embeddings": true,
140
+ "transformers_version": "4.57.6",
141
+ "use_cache": false,
142
+ "vocab_size": 200064
143
+ }
debug.log ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/120 [00:00<?, ?it/s][2026-01-24 12:52:54,083] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:6937] Running evaluation step...
 
 
1
  0%| | 0/100 [00:00<?, ?it/s]
 
2
  2%|▋ | 2/100 [00:00<00:08, 11.63it/s]
 
3
  4%|█▍ | 4/100 [00:01<00:32, 2.97it/s]
 
4
  5%|█▊ | 5/100 [00:01<00:37, 2.50it/s]
 
5
  6%|██▏ | 6/100 [00:02<00:40, 2.32it/s]
 
6
  7%|██▌ | 7/100 [00:02<00:42, 2.20it/s]
 
7
  8%|██▉ | 8/100 [00:03<00:43, 2.12it/s]
 
8
  9%|███▏ | 9/100 [00:03<00:45, 2.01it/s]
 
9
  10%|███▌ | 10/100 [00:04<00:44, 2.00it/s]
 
10
  11%|███▊ | 11/100 [00:04<00:44, 1.99it/s]
 
11
  12%|████▏ | 12/100 [00:05<00:44, 1.98it/s]
 
12
  13%|████▌ | 13/100 [00:05<00:45, 1.92it/s]
 
13
  14%|████▉ | 14/100 [00:06<00:44, 1.94it/s]
 
14
  15%|█████▎ | 15/100 [00:06<00:43, 1.94it/s]
 
15
  16%|█████▌ | 16/100 [00:07<00:43, 1.95it/s]
 
16
  17%|█████▉ | 17/100 [00:07<00:43, 1.91it/s]
 
17
  18%|██████▎ | 18/100 [00:08<00:42, 1.93it/s]
 
18
  19%|██████▋ | 19/100 [00:09<00:41, 1.94it/s]
 
19
  20%|███████ | 20/100 [00:09<00:41, 1.95it/s]
 
20
  21%|███████▎ | 21/100 [00:10<00:41, 1.91it/s]
 
21
  22%|███████▋ | 22/100 [00:10<00:40, 1.93it/s]
 
22
  23%|████████ | 23/100 [00:11<00:39, 1.94it/s]
 
23
  24%|████████▍ | 24/100 [00:11<00:39, 1.95it/s]
 
24
  25%|████████▊ | 25/100 [00:12<00:39, 1.90it/s]
 
25
  26%|█████████ | 26/100 [00:12<00:38, 1.93it/s]
 
26
  27%|█████████▍ | 27/100 [00:13<00:37, 1.94it/s]
 
27
  28%|█████████▊ | 28/100 [00:13<00:37, 1.94it/s]
 
28
  29%|██████████▏ | 29/100 [00:14<00:37, 1.90it/s]
 
29
  30%|██████████▌ | 30/100 [00:14<00:36, 1.92it/s]
 
30
  31%|██████████▊ | 31/100 [00:15<00:35, 1.93it/s]
 
31
  32%|███████████▏ | 32/100 [00:15<00:35, 1.94it/s]
 
32
  33%|███████████▌ | 33/100 [00:16<00:35, 1.90it/s]
 
33
  34%|███████████▉ | 34/100 [00:16<00:34, 1.92it/s]
 
34
  35%|████████████▎ | 35/100 [00:17<00:33, 1.93it/s]
 
35
  36%|████████████▌ | 36/100 [00:17<00:33, 1.94it/s]
 
36
  37%|████████████▉ | 37/100 [00:18<00:33, 1.90it/s]
 
37
  38%|█████████████▎ | 38/100 [00:18<00:32, 1.92it/s]
 
38
  39%|█████████████▋ | 39/100 [00:19<00:31, 1.93it/s]
 
39
  40%|██████████████ | 40/100 [00:19<00:30, 1.94it/s]
 
40
  41%|██████████████▎ | 41/100 [00:20<00:31, 1.90it/s]
 
41
  42%|██████████████▋ | 42/100 [00:20<00:30, 1.92it/s]
 
42
  43%|███████████████ | 43/100 [00:21<00:29, 1.93it/s]
 
43
  44%|███████████████▍ | 44/100 [00:21<00:28, 1.94it/s]
 
44
  45%|███████████████▊ | 45/100 [00:22<00:28, 1.90it/s]
 
45
  46%|████████████████ | 46/100 [00:23<00:28, 1.92it/s]
 
46
  47%|████████████████▍ | 47/100 [00:23<00:27, 1.93it/s]
 
47
  48%|████████████████▊ | 48/100 [00:24<00:26, 1.94it/s]
 
48
  49%|█████████████████▏ | 49/100 [00:24<00:26, 1.90it/s]
 
49
  50%|█████████████████▌ | 50/100 [00:25<00:26, 1.92it/s]
 
50
  51%|█████████████████▊ | 51/100 [00:25<00:25, 1.93it/s]
 
51
  52%|██████████████████▏ | 52/100 [00:26<00:24, 1.94it/s]
 
52
  53%|██████████████████▌ | 53/100 [00:26<00:24, 1.90it/s]
 
53
  54%|██████████████████▉ | 54/100 [00:27<00:23, 1.92it/s]
 
54
  55%|███████████████████▎ | 55/100 [00:27<00:23, 1.93it/s]
 
55
  56%|███████████████████▌ | 56/100 [00:28<00:22, 1.93it/s]
 
56
  57%|███████████████████▉ | 57/100 [00:28<00:22, 1.90it/s]
 
57
  58%|████████████████████▎ | 58/100 [00:29<00:21, 1.92it/s]
 
58
  59%|████████████████████▋ | 59/100 [00:29<00:21, 1.92it/s]
 
59
  60%|█████████████████████ | 60/100 [00:30<00:20, 1.93it/s]
 
60
  61%|█████████████████████▎ | 61/100 [00:30<00:20, 1.89it/s]
 
61
  62%|█████████████████████▋ | 62/100 [00:31<00:19, 1.91it/s]
 
62
  63%|██████████████████████ | 63/100 [00:31<00:19, 1.92it/s]
 
63
  64%|██████████████████████▍ | 64/100 [00:32<00:18, 1.93it/s]
 
64
  65%|██████████████████████▊ | 65/100 [00:32<00:18, 1.89it/s]
 
65
  66%|███████████████████████ | 66/100 [00:33<00:17, 1.91it/s]
 
66
  67%|███████████████████████▍ | 67/100 [00:34<00:17, 1.92it/s]
 
67
  68%|███████████████████████▊ | 68/100 [00:34<00:16, 1.93it/s]
 
68
  69%|████████████████████████▏ | 69/100 [00:35<00:16, 1.89it/s]
 
69
  70%|████████████████████████▌ | 70/100 [00:35<00:15, 1.91it/s]
 
70
  71%|████████████████████████▊ | 71/100 [00:36<00:15, 1.92it/s]
 
71
  72%|█████████████████████████▏ | 72/100 [00:36<00:14, 1.93it/s]
 
72
  73%|█████████████████████████▌ | 73/100 [00:37<00:14, 1.89it/s]
 
73
  74%|█████████████████████████▉ | 74/100 [00:37<00:13, 1.91it/s]
 
74
  75%|██████████████████████████▎ | 75/100 [00:38<00:13, 1.92it/s]
 
75
  76%|██████████████████████████▌ | 76/100 [00:38<00:12, 1.93it/s]
 
76
  77%|██████████████████████████▉ | 77/100 [00:39<00:12, 1.89it/s]
 
77
  78%|███████████████████████████▎ | 78/100 [00:39<00:11, 1.91it/s]
 
78
  79%|███████████████████████████▋ | 79/100 [00:40<00:10, 1.92it/s]
 
79
  80%|████████████████████████████ | 80/100 [00:40<00:10, 1.92it/s]
 
80
  81%|████████████████████████████▎ | 81/100 [00:41<00:10, 1.88it/s]
 
81
  82%|████████████████████████████▋ | 82/100 [00:41<00:09, 1.91it/s]
 
82
  83%|█████████████████████████████ | 83/100 [00:42<00:08, 1.92it/s]
 
83
  84%|█████████████████████████████▍ | 84/100 [00:42<00:08, 1.92it/s]
 
84
  85%|█████████████████████████████▊ | 85/100 [00:43<00:07, 1.88it/s]
 
85
  86%|██████████████████████████████ | 86/100 [00:43<00:07, 1.90it/s]
 
86
  87%|██████████████████████████████▍ | 87/100 [00:44<00:06, 1.91it/s]
 
87
  88%|██████████████████████████████▊ | 88/100 [00:44<00:06, 1.92it/s]
 
88
  89%|███████████████████████████████▏ | 89/100 [00:45<00:05, 1.88it/s]
 
89
  90%|███████████████████████████████▌ | 90/100 [00:46<00:05, 1.90it/s]
 
90
  91%|███████████████████████████████▊ | 91/100 [00:46<00:04, 1.91it/s]
 
91
  92%|████████████████████████████████▏ | 92/100 [00:47<00:04, 1.92it/s]
 
92
  93%|████████████████████████████████▌ | 93/100 [00:47<00:03, 1.88it/s]
 
93
  94%|████████████████████████████████▉ | 94/100 [00:48<00:03, 1.90it/s]
 
94
  95%|█████████████████████████████████▎ | 95/100 [00:48<00:02, 1.91it/s]
 
95
  96%|█████████████████████████████████▌ | 96/100 [00:49<00:02, 1.92it/s]
 
96
  97%|█████████████████████████████████▉ | 97/100 [00:49<00:01, 1.88it/s]
 
97
  98%|██████████████████████████████████▎| 98/100 [00:50<00:01, 1.90it/s]
 
98
  99%|██████████████████████████████████▋| 99/100 [00:50<00:00, 1.91it/s]
 
99
 
 
100
 
 
101
  0%| | 0/120 [00:53<?, ?it/s]
 
 
102
  
103
  1%|▎ | 1/120 [01:03<2:06:12, 63.63s/it]
104
  2%|▌ | 2/120 [01:11<1:01:02, 31.04s/it]
105
  2%|▉ | 3/120 [01:20<40:13, 20.63s/it]
106
  3%|█▏ | 4/120 [01:28<30:26, 15.74s/it]
107
  4%|█▌ | 5/120 [01:36<24:59, 13.04s/it]
108
 
 
109
  4%|█▌ | 5/120 [01:36<24:59, 13.04s/it]
110
  5%|█▊ | 6/120 [01:44<21:41, 11.41s/it]
111
  6%|██ | 7/120 [01:53<19:33, 10.38s/it]
112
  7%|██▍ | 8/120 [02:01<18:06, 9.70s/it]
113
  8%|██▋ | 9/120 [02:09<17:06, 9.25s/it]
114
  8%| | 10/120 [02:17<16:23, 8
115
 
 
116
  8%| | 10/120 [02:17<16:23, 8
117
  9%|█████▌ | 11/120 [02:26<15:51, 8.73s/it]
118
  10%|████████████▍ | 12/120 [02:34<15:26, 8.58s/it]
119
  11%|█████████████▍ | 13/120 [02:40<14:02, 7.87s/it]
120
  12%|████ | 14/120 [02:50<15:11, 8.60s/it]
121
  12%|███████▋ | 15/120 [02:59<14:52, 8.50s/it]
122
 
 
123
  12%|███████▋ | 15/120 [02:59<14:52, 8.50s/it]
124
  13%|████████▏ | 16/120 [03:07<14:37, 8.44s/it]
125
  14%|████████▋ | 17/120 [03:15<14:24, 8.39s/it]
126
  15%|█████████▏ | 18/120 [03:23<14:12, 8.35s/it]
127
  16%|█████████▋ | 19/120 [03:32<14:01, 8.34s/it]
128
  17%|██████████▏ | 20/120 [03:40<13:51, 8.32s/it]
129
 
 
130
  17%|██████████▏ | 20/120 [03:40<13:51, 8.32s/it]
131
  18%|██████████▋ | 21/120 [03:48<13:42, 8.30s/it]
132
  18%|██████▍ | 22/120 [03:57<13:33, 8.30s/it]
133
  19%|██████▋ | 23/120 [04:05<13:24, 8.29s/it]
134
  20%|███████ | 24/120 [04:13<13:15, 8.29s/it]
135
  21%|███████▎ | 25/120 [04:21<13:07, 8.29s/it]
136
 
 
137
  21%|███████▎ | 25/120 [04:21<13:07, 8.29s/it]
138
  22%|███████▌ | 26/120 [04:28<12:02, 7.69s/it]
139
  22%|███████▉ | 27/120 [04:39<13:27, 8.68s/it]
140
  23%|████████▏ | 28/120 [04:47<13:07, 8.56s/it]
141
  24%|████████▍ | 29/120 [04:54<12:26, 8.20s/it]
142
  25%|████████▊ | 30/120 [05:03<12:19, 8.22s/it]
143
 
 
144
  25%|████████▊ | 30/120 [05:03<12:19, 8.22s/it]
145
  26%|█████████ | 31/120 [05:11<12:12, 8.23s/it]
146
  27%|█████████▎ | 32/120 [05:19<12:05, 8.24s/it]
147
  28%|█████████▋ | 33/120 [05:27<11:57, 8.25s/it]
148
  28%|█████████▉ | 34/120 [05:36<11:50, 8.26s/it]
149
  29%|██████████▏ | 35/120 [05:44<11:41, 8.26s/it]
150
 
 
151
  29%|██████████▏ | 35/120 [05:44<11:41, 8.26s/it]
152
  30%|██████████▌ | 36/120 [05:52<11:33, 8.26s/it]
153
  31%|██████████▊ | 37/120 [06:01<11:25, 8.26s/it]
154
  32%|███████████ | 38/120 [06:09<11:17, 8.26s/it]
155
  32%|███████████▍ | 39/120 [06:15<10:20, 7.66s/it]
156
  33%|███████████▋ | 40/120 [06:25<11:15, 8.44s/it]
157
 
 
158
  33%|███████████▋ | 40/120 [06:25<11:15, 8.44s/it]
159
  34%|███████████▉ | 41/120 [06:34<11:03, 8.39s/it]
160
  35%|████████████▎ | 42/120 [06:42<10:52, 8.36s/it]
161
  36%|████████████▌ | 43/120 [06:50<10:42, 8.34s/it]
162
  37%|████████████▊ | 44/120 [06:58<10:32, 8.32s/it]
163
  38%|█████████████▏ | 45/120 [07:07<10:22, 8.31s/it]
164
 
 
165
  38%|█████████████▏ | 45/120 [07:07<10:22, 8.31s/it]
166
  38%|█████████████▍ | 46/120 [07:15<10:13, 8.30s/it]
167
  39%|█████████████▋ | 47/120 [07:23<10:04, 8.29s/it]
168
  40%|██████████████ | 48/120 [07:32<09:56, 8.28s/it]
169
  41%|██████████████▎ | 49/120 [07:39<09:27, 8.00s/it]
170
  42%|██████████████▌ | 50/120 [07:47<09:25, 8.07s/it]
171
 
 
172
  42%|██████████████▌ | 50/120 [07:47<09:25, 8.07s/it]
173
  42%|██████████████▉ | 51/120 [07:55<09:20, 8.13s/it]
174
  43%|███████████████▏ | 52/120 [08:02<08:34, 7.57s/it]
175
  44%|███████████████▍ | 53/120 [08:12<09:17, 8.33s/it]
176
  45%|███████████████▊ | 54/120 [08:20<09:08, 8.31s/it]
177
  46%|████████████████ | 55/120 [08:28<08:59, 8.30s/it]
178
 
 
179
  46%|████████████████ | 55/120 [08:28<08:59, 8.30s/it]
180
  47%|████████████████▎ | 56/120 [08:37<08:50, 8.29s/it]
181
  48%|████████████████▋ | 57/120 [08:45<08:41, 8.28s/it]
182
  48%|████████████████▉ | 58/120 [08:53<08:33, 8.28s/it]
183
  49%|█████████████████▏ | 59/120 [09:01<08:24, 8.27s/it]
184
  50%|█████████████████▌ | 60/120 [09:10<08:16, 8.27s/it]
185
 
 
186
  50%|█████████████████▌ | 60/120 [09:10<08:16, 8.27s/it]
187
  51%|█████████████████▊ | 61/120 [09:18<08:08, 8.27s/it]
188
  52%|██████████████████ | 62/120 [09:26<07:59, 8.27s/it]
189
  52%|██████████████████▍ | 63/120 [09:34<07:51, 8.27s/it]
190
  53%|██████████████████▋ | 64/120 [09:43<07:42, 8.27s/it]
191
  54%|██████████████████▉ | 65/120 [09:49<07:01, 7.66s/it]
192
 
 
193
  54%|██████████████████▉ | 65/120 [09:49<07:01, 7.66s/it]
194
  55%|███████████████████▎ | 66/120 [09:59<07:34, 8.42s/it]
195
  56%|███████████████████▌ | 67/120 [10:07<07:23, 8.38s/it]
196
  57%|███████████████████▊ | 68/120 [10:16<07:13, 8.34s/it]
197
  57%|████████████████████▏ | 69/120 [10:24<07:04, 8.32s/it]
198
  58%|████████████████████▍ | 70/120 [10:32<06:55, 8.30s/it]
199
 
 
200
  58%|████████████████████▍ | 70/120 [10:32<06:55, 8.30s/it]
201
  59%|████████████████████▋ | 71/120 [10:40<06:46, 8.30s/it]
202
  60%|█████████████████████ | 72/120 [10:49<06:37, 8.29s/it]
203
  61%|█████████████████████▎ | 73/120 [10:57<06:29, 8.28s/it]
204
  62%|█████████████████████▌ | 74/120 [11:04<06:07, 8.00s/it]
205
  62%|█████████████████████▉ | 75/120 [11:13<06:03, 8.08s/it]
206
 
 
207
  62%|█████████████████████▉ | 75/120 [11:13<06:03, 8.08s/it]
208
  63%|██████████████████████▏ | 76/120 [11:21<05:58, 8.14s/it]
209
  64%|██████████████████████▍ | 77/120 [11:29<05:51, 8.18s/it]
210
  65%|██████████████████████▊ | 78/120 [11:35<05:19, 7.60s/it]
211
  66%|███████████████████████ | 79/120 [11:46<05:44, 8.40s/it]
212
  67%|███████████████████████▎ | 80/120 [11:54<05:34, 8.36s/it]
213
 
 
214
  67%|███████████████████████▎ | 80/120 [11:54<05:34, 8.36s/it]
215
  68%|███████████████████████▋ | 81/120 [12:02<05:24, 8.33s/it]
216
  68%|███████████████████████▉ | 82/120 [12:10<05:15, 8.31s/it]
217
  69%|████████████████████████▏ | 83/120 [12:19<05:07, 8.30s/it]
218
  70%|████████████████████████▌ | 84/120 [12:27<04:58, 8.29s/it]
219
  71%|████████████████████████▊ | 85/120 [12:35<04:49, 8.28s/it]
220
 
 
221
  71%|████████████████████████▊ | 85/120 [12:35<04:49, 8.28s/it]
222
  72%|█████████████████████████ | 86/120 [12:43<04:41, 8.27s/it]
223
  72%|█████████████████████████▍ | 87/120 [12:52<04:32, 8.27s/it]
224
  73%|█████████████████████████▋ | 88/120 [12:59<04:15, 7.99s/it]
225
  74%|█████████████████████████▉ | 89/120 [13:07<04:10, 8.07s/it]
226
  75%|██████████████████████████▎ | 90/120 [13:16<04:03, 8.13s/it]
227
 
 
228
  75%|██████████████████████████▎ | 90/120 [13:16<04:03, 8.13s/it]
229
  76%|██████████████████████████▌ | 91/120 [13:22<03:39, 7.57s/it]
230
  77%|██████████████████████████▊ | 92/120 [13:32<03:54, 8.36s/it]
231
  78%|███████████████████████████▏ | 93/120 [13:39<03:37, 8.05s/it]
232
  78%|███████████████████████████▍ | 94/120 [13:48<03:30, 8.11s/it]
233
  79%|███████████████████████████▋ | 95/120 [13:56<03:23, 8.15s/it]
234
 
 
235
  79%|███████████████████████████▋ | 95/120 [13:56<03:23, 8.15s/it]
236
  80%|████████████████████████████ | 96/120 [14:04<03:16, 8.18s/it]
237
  81%|████████████████████████████▎ | 97/120 [14:12<03:08, 8.20s/it]
238
  82%|████████████████████████████▌ | 98/120 [14:21<03:00, 8.22s/it]
239
  82%|████████████████████████████▉ | 99/120 [14:29<02:52, 8.23s/it]
240
  83%|████████████████████████████▎ | 100/120 [14:36<02:39, 7.96s/it]
241
 
 
242
  83%|████████████████████████████▎ | 100/120 [14:36<02:39, 7.96s/it][2026-01-24 13:07:30,848] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:6937] Running evaluation step...
 
 
243
  0%| | 0/100 [00:00<?, ?it/s]
 
244
  2%|▋ | 2/100 [00:00<00:25, 3.84it/s]
 
245
  3%|█ | 3/100 [00:01<00:35, 2.70it/s]
 
246
  4%|█▍ | 4/100 [00:01<00:41, 2.33it/s]
 
247
  5%|█▊ | 5/100 [00:02<00:54, 1.74it/s]
 
248
  6%|██▏ | 6/100 [00:02<00:52, 1.79it/s]
 
249
  7%|██▌ | 7/100 [00:03<00:50, 1.83it/s]
 
250
  8%|██▉ | 8/100 [00:03<00:49, 1.85it/s]
 
251
  9%|███▏ | 9/100 [00:04<00:49, 1.83it/s]
 
252
  10%|███▌ | 10/100 [00:05<00:48, 1.86it/s]
 
253
  11%|███▊ | 11/100 [00:05<00:47, 1.87it/s]
 
254
  12%|████▏ | 12/100 [00:06<00:46, 1.89it/s]
 
255
  13%|████▌ | 13/100 [00:06<00:46, 1.85it/s]
 
256
  14%|████▉ | 14/100 [00:07<00:45, 1.87it/s]
 
257
  15%|█████▎ | 15/100 [00:07<00:45, 1.88it/s]
 
258
  16%|█████▌ | 16/100 [00:08<00:44, 1.89it/s]
 
259
  17%|█████▉ | 17/100 [00:08<00:44, 1.86it/s]
 
260
  18%|██████▎ | 18/100 [00:09<00:43, 1.88it/s]
 
261
  19%|██████▋ | 19/100 [00:09<00:42, 1.89it/s]
 
262
  20%|███████ | 20/100 [00:10<00:42, 1.89it/s]
 
263
  21%|███████▎ | 21/100 [00:10<00:42, 1.86it/s]
 
264
  22%|███████▋ | 22/100 [00:11<00:41, 1.88it/s]
 
265
  23%|████████ | 23/100 [00:11<00:40, 1.89it/s]
 
266
  24%|████████▍ | 24/100 [00:12<00:40, 1.90it/s]
 
267
  25%|████████▊ | 25/100 [00:13<00:40, 1.86it/s]
 
268
  26%|█████████ | 26/100 [00:13<00:39, 1.88it/s]
 
269
  27%|█████████▍ | 27/100 [00:14<00:38, 1.89it/s]
 
270
  28%|█████████▊ | 28/100 [00:14<00:37, 1.90it/s]
 
271
  29%|██████████▏ | 29/100 [00:15<00:38, 1.86it/s]
 
272
  30%|██████████▌ | 30/100 [00:15<00:37, 1.88it/s]
 
273
  31%|██████████▊ | 31/100 [00:16<00:36, 1.89it/s]
 
274
  32%|███████████▏ | 32/100 [00:16<00:35, 1.90it/s]
 
275
  33%|███████████▌ | 33/100 [00:17<00:36, 1.86it/s]
 
276
  34%|███████████▉ | 34/100 [00:17<00:35, 1.88it/s]
 
277
  35%|████████████▎ | 35/100 [00:18<00:34, 1.89it/s]
 
278
  36%|████████████▌ | 36/100 [00:18<00:33, 1.90it/s]
 
279
  37%|████████████▉ | 37/100 [00:19<00:33, 1.86it/s]
 
280
  38%|█████████████▎ | 38/100 [00:19<00:32, 1.88it/s]
 
281
  39%|█████████████▋ | 39/100 [00:20<00:32, 1.89it/s]
 
282
  40%|██████████████ | 40/100 [00:21<00:31, 1.90it/s]
 
283
  41%|██████████████▎ | 41/100 [00:21<00:31, 1.86it/s]
 
284
  42%|██████████████▋ | 42/100 [00:22<00:30, 1.88it/s]
 
285
  43%|███████████████ | 43/100 [00:22<00:30, 1.89it/s]
 
286
  44%|███████████████▍ | 44/100 [00:23<00:29, 1.90it/s]
 
287
  45%|███████████████▊ | 45/100 [00:23<00:29, 1.86it/s]
 
288
  46%|████████████████ | 46/100 [00:24<00:28, 1.88it/s]
 
289
  47%|████████████████▍ | 47/100 [00:24<00:28, 1.89it/s]
 
290
  48%|████████████████▊ | 48/100 [00:25<00:27, 1.90it/s]
 
291
  49%|█████████████████▏ | 49/100 [00:25<00:27, 1.86it/s]
 
292
  50%|█████████████████▌ | 50/100 [00:26<00:26, 1.88it/s]
 
293
  51%|█████████████████▊ | 51/100 [00:26<00:25, 1.89it/s]
 
294
  52%|██████████████████▏ | 52/100 [00:27<00:25, 1.89it/s]
 
295
  53%|██████████████████▌ | 53/100 [00:27<00:25, 1.86it/s]
 
296
  54%|██████████████████▉ | 54/100 [00:28<00:24, 1.88it/s]
 
297
  55%|███████████████████▎ | 55/100 [00:28<00:23, 1.89it/s]
 
298
  56%|███████████████████▌ | 56/100 [00:29<00:23, 1.89it/s]
 
299
  57%|███████████████████▉ | 57/100 [00:30<00:23, 1.86it/s]
 
300
  58%|████████████████████▎ | 58/100 [00:30<00:22, 1.88it/s]
 
301
  59%|████████████████████▋ | 59/100 [00:31<00:21, 1.89it/s]
 
302
  60%|█████████████████████ | 60/100 [00:31<00:21, 1.89it/s]
 
303
  61%|█████████████████████▎ | 61/100 [00:32<00:20, 1.86it/s]
 
304
  62%|█████████████████████▋ | 62/100 [00:32<00:20, 1.88it/s]
 
305
  63%|██████████████████████ | 63/100 [00:33<00:19, 1.89it/s]
 
306
  64%|██████████████████████▍ | 64/100 [00:33<00:18, 1.90it/s]
 
307
  65%|██████████████████████▊ | 65/100 [00:34<00:18, 1.86it/s]
 
308
  66%|███████████████████████ | 66/100 [00:34<00:18, 1.88it/s]
 
309
  67%|███████████████████████▍ | 67/100 [00:35<00:17, 1.89it/s]
 
310
  68%|███████████████████████▊ | 68/100 [00:35<00:16, 1.90it/s]
 
311
  69%|████████████████████████▏ | 69/100 [00:36<00:16, 1.86it/s]
 
312
  70%|████████████████████████▌ | 70/100 [00:36<00:15, 1.88it/s]
 
313
  71%|████████████████████████▊ | 71/100 [00:37<00:15, 1.89it/s]
 
314
  72%|█████████████████████████▏ | 72/100 [00:38<00:14, 1.90it/s]
 
315
  73%|█████████████████████████▌ | 73/100 [00:38<00:14, 1.86it/s]
 
316
  74%|█████████████████████████▉ | 74/100 [00:39<00:13, 1.88it/s]
 
317
  75%|██████████████████████████▎ | 75/100 [00:39<00:13, 1.89it/s]
 
318
  76%|██████████████████████████▌ | 76/100 [00:40<00:12, 1.89it/s]
 
319
  77%|██████████████████████████▉ | 77/100 [00:40<00:12, 1.86it/s]
 
320
  78%|███████████████████████████▎ | 78/100 [00:41<00:11, 1.88it/s]
 
321
  79%|███████████████████████████▋ | 79/100 [00:41<00:11, 1.89it/s]
 
322
  80%|████████████████████████████ | 80/100 [00:42<00:10, 1.89it/s]
 
323
  81%|████████████████████████████▎ | 81/100 [00:42<00:10, 1.86it/s]
 
324
  82%|████████████████████████████▋ | 82/100 [00:43<00:09, 1.88it/s]
 
325
  83%|█████████████████████████████ | 83/100 [00:43<00:08, 1.89it/s]
 
326
  84%|█████████████████████████████▍ | 84/100 [00:44<00:08, 1.90it/s]
 
327
  85%|█████████████████████████████▊ | 85/100 [00:44<00:08, 1.86it/s]
 
328
  86%|██████████████████████████████ | 86/100 [00:45<00:07, 1.88it/s]
 
329
  87%|██████████████████████████████▍ | 87/100 [00:46<00:06, 1.89it/s]
 
330
  88%|██████████████████████████████▊ | 88/100 [00:46<00:06, 1.90it/s]
 
331
  89%|███████████████████████████████▏ | 89/100 [00:47<00:05, 1.86it/s]
 
332
  90%|███████████████████████████████▌ | 90/100 [00:47<00:05, 1.88it/s]
 
333
  91%|███████████████████████████████▊ | 91/100 [00:48<00:04, 1.89it/s]
 
334
  92%|████████████████████████████████▏ | 92/100 [00:48<00:04, 1.90it/s]
 
335
  93%|████████████████████████████████▌ | 93/100 [00:49<00:03, 1.86it/s]
 
336
  94%|████████████████████████████████▉ | 94/100 [00:49<00:03, 1.88it/s]
 
337
  95%|█████████████████████████████████▎ | 95/100 [00:50<00:02, 1.89it/s]
 
338
  96%|█████████████████████████████████▌ | 96/100 [00:50<00:02, 1.90it/s]
 
339
  97%|█████████████████████████████████▉ | 97/100 [00:51<00:01, 1.86it/s]
 
340
  98%|██████████████████████████████████▎| 98/100 [00:51<00:01, 1.88it/s]
 
341
  99%|██████████████████████████████████▋| 99/100 [00:52<00:00, 1.89it/s]
 
342
 
 
343
 
 
344
  83%|████████████████████████████▎ | 100/120 [15:30<02:39, 7.96s/it]
 
 
345
  
346
  84%|████████████████████████████▌ | 101/120 [15:39<07:42, 24.33s/it]
347
  85%|████████████████████████████▉ | 102/120 [15:47<05:51, 19.51s/it]
348
  86%|█████████████████████████████▏ | 103/120 [15:55<04:34, 16.14s/it]
349
  87%|█████████████████████████████▍ | 104/120 [16:02<03:30, 13.17s/it]
350
  88%|█████████████████████████████▊ | 105/120 [16:12<03:04, 12.32s/it]
351
 
 
352
  88%|█████████████████████████████▊ | 105/120 [16:12<03:04, 12.32s/it]
353
  88%|██████████████████████████████ | 106/120 [16:20<02:35, 11.11s/it]
354
  89%|██████████████████████████████▎ | 107/120 [16:28<02:13, 10.26s/it]
355
  90%|██████████████████████████████▌ | 108/120 [16:37<01:55, 9.66s/it]
356
  91%|██████████████████████████████▉ | 109/120 [16:45<01:41, 9.25s/it]
357
  92%|███████████████████████████████▏ | 110/120 [16:53<01:29, 8.96s/it]
358
 
 
359
  92%|███████████████████████████████▏ | 110/120 [16:53<01:29, 8.96s/it]
360
  92%|███████████████████████████████▍ | 111/120 [17:02<01:18, 8.75s/it]
361
  93%|███████████████████████████████▋ | 112/120 [17:10<01:08, 8.60s/it]
362
  94%|████████████████████████████████ | 113/120 [17:18<00:59, 8.50s/it]
363
  95%|████████████████████████████████▎ | 114/120 [17:26<00:50, 8.43s/it]
364
  96%|████████████████████████████████▌ | 115/120 [17:35<00:41, 8.38s/it]
365
 
 
366
  96%|████████████████████████████████▌ | 115/120 [17:35<00:41, 8.38s/it]
367
  97%|████████████████████████████████▊ | 116/120 [17:43<00:33, 8.34s/it]
368
  98%|█████████████████████████████████▏| 117/120 [17:49<00:23, 7.72s/it]
369
  98%|█████████████████████████████████▍| 118/120 [17:59<00:16, 8.47s/it]
370
  99%|█████████████████████████████████▋| 119/120 [18:08<00:08, 8.41s/it]
371
 
 
 
372
 
 
 
1
+ [2026-01-24 12:51:56,817] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:6937] baseline 0.000GB ()
2
+ [2026-01-24 12:51:56,820] [INFO] [axolotl.cli.config.load_cfg:259] [PID:6937] config:
3
+ {
4
+ "activation_offloading": false,
5
+ "adapter": "lora",
6
+ "axolotl_config_path": "using_axolotl/lora.yml",
7
+ "base_model": "microsoft/Phi-4-mini-instruct",
8
+ "base_model_config": "microsoft/Phi-4-mini-instruct",
9
+ "batch_size": 8,
10
+ "bf16": true,
11
+ "capabilities": {
12
+ "bf16": true,
13
+ "compute_capability": "sm_86",
14
+ "fp8": false,
15
+ "n_gpu": 1,
16
+ "n_node": 1
17
+ },
18
+ "chat_template": "tokenizer_default",
19
+ "context_parallel_size": 1,
20
+ "dataloader_num_workers": 1,
21
+ "dataloader_pin_memory": true,
22
+ "dataloader_prefetch_factor": 256,
23
+ "dataset_num_proc": 9,
24
+ "datasets": [
25
+ {
26
+ "message_property_mappings": {
27
+ "content": "content",
28
+ "role": "role"
29
+ },
30
+ "path": "DannyAI/African-History-QA-Dataset",
31
+ "split": "train",
32
+ "trust_remote_code": false,
33
+ "type": "alpaca_chat.load_qa"
34
+ }
35
+ ],
36
+ "ddp": false,
37
+ "device": "cuda:0",
38
+ "dion_rank_fraction": 1.0,
39
+ "dion_rank_multiple_of": 1,
40
+ "env_capabilities": {
41
+ "torch_version": "2.9.1"
42
+ },
43
+ "eval_batch_size": 2,
44
+ "eval_causal_lm_metrics": [
45
+ "sacrebleu",
46
+ "comet",
47
+ "ter",
48
+ "chrf"
49
+ ],
50
+ "eval_max_new_tokens": 128,
51
+ "eval_sample_packing": false,
52
+ "eval_steps": 100,
53
+ "eval_strategy": "steps",
54
+ "eval_table_size": 0,
55
+ "experimental_skip_move_to_device": true,
56
+ "fp16": false,
57
+ "gradient_accumulation_steps": 4,
58
+ "gradient_checkpointing": false,
59
+ "hub_model_id": "DannyAI/phi4_lora_axolotl",
60
+ "include_tkps": true,
61
+ "is_falcon_derived_model": false,
62
+ "is_llama_derived_model": false,
63
+ "is_mistral_derived_model": false,
64
+ "learning_rate": 2e-05,
65
+ "lisa_layers_attribute": "model.layers",
66
+ "load_best_model_at_end": false,
67
+ "load_in_4bit": false,
68
+ "load_in_8bit": false,
69
+ "local_rank": 0,
70
+ "logging_steps": 5,
71
+ "lora_alpha": 16,
72
+ "lora_dropout": 0.05,
73
+ "lora_r": 8,
74
+ "lora_target_modules": [
75
+ "q_proj",
76
+ "v_proj",
77
+ "k_proj",
78
+ "o_proj"
79
+ ],
80
+ "loraplus_lr_embedding": 1e-06,
81
+ "lr_scheduler": "cosine",
82
+ "mean_resizing_embeddings": false,
83
+ "micro_batch_size": 2,
84
+ "model_config_type": "phi3",
85
+ "num_epochs": 10.0,
86
+ "optimizer": "adamw_torch",
87
+ "otel_metrics_host": "localhost",
88
+ "otel_metrics_port": 8000,
89
+ "output_dir": "./phi4_african_history_lora_out",
90
+ "pad_to_sequence_len": true,
91
+ "pretrain_multipack_attn": true,
92
+ "profiler_steps_start": 0,
93
+ "qlora_sharded_model_loading": false,
94
+ "ray_num_workers": 1,
95
+ "remove_unused_columns": false,
96
+ "resources_per_worker": {
97
+ "GPU": 1
98
+ },
99
+ "sample_packing": true,
100
+ "sample_packing_bin_size": 200,
101
+ "sample_packing_group_size": 100000,
102
+ "save_only_model": false,
103
+ "save_safetensors": true,
104
+ "save_steps": 200,
105
+ "save_strategy": "steps",
106
+ "sequence_len": 2048,
107
+ "shuffle_before_merging_datasets": false,
108
+ "shuffle_merged_datasets": true,
109
+ "skip_prepare_dataset": false,
110
+ "streaming_multipack_buffer_size": 10000,
111
+ "strict": false,
112
+ "tensor_parallel_size": 1,
113
+ "test_datasets": [
114
+ {
115
+ "message_property_mappings": {
116
+ "content": "content",
117
+ "role": "role"
118
+ },
119
+ "path": "DannyAI/African-History-QA-Dataset",
120
+ "split": "validation",
121
+ "trust_remote_code": false,
122
+ "type": "alpaca_chat.load_qa"
123
+ }
124
+ ],
125
+ "tiled_mlp_use_original_mlp": true,
126
+ "tokenizer_config": "microsoft/Phi-4-mini-instruct",
127
+ "tokenizer_save_jinja_files": true,
128
+ "tokenizer_type": "AutoTokenizer",
129
+ "torch_dtype": "torch.bfloat16",
130
+ "train_on_inputs": false,
131
+ "trl": {
132
+ "log_completions": false,
133
+ "mask_truncated_completions": false,
134
+ "ref_model_mixup_alpha": 0.9,
135
+ "ref_model_sync_steps": 64,
136
+ "scale_rewards": true,
137
+ "sync_ref_model": false,
138
+ "use_vllm": false,
139
+ "vllm_server_host": "0.0.0.0",
140
+ "vllm_server_port": 8000
141
+ },
142
+ "type_of_model": "AutoModelForCausalLM",
143
+ "use_otel_metrics": false,
144
+ "use_ray": false,
145
+ "use_wandb": true,
146
+ "val_set_size": 0.0,
147
+ "vllm": {
148
+ "device": "auto",
149
+ "dtype": "auto",
150
+ "gpu_memory_utilization": 0.9,
151
+ "host": "0.0.0.0",
152
+ "port": 8000
153
+ },
154
+ "wandb_name": "phi4_lora_axolotl",
155
+ "wandb_project": "phi4_african_history",
156
+ "warmup_steps": 10,
157
+ "weight_decay": 0.0,
158
+ "world_size": 1
159
+ }
160
+ [2026-01-24 12:51:58,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:6937] EOS: 199999 / <|endoftext|>
161
+ [2026-01-24 12:51:58,452] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:6937] BOS: 199999 / <|endoftext|>
162
+ [2026-01-24 12:51:58,453] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:6937] PAD: 199999 / <|endoftext|>
163
+ [2026-01-24 12:51:58,453] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:6937] UNK: 199999 / <|endoftext|>
164
+ [2026-01-24 12:51:58,453] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:6937] Unable to find prepared dataset in last_run_prepared/89363fb9438bda5d225c172d067e1ebf
165
+ [2026-01-24 12:51:58,453] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:6937] Loading raw datasets...
166
+ [2026-01-24 12:51:58,453] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:6937] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
167
+ [2026-01-24 12:52:01,502] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:6937] Loading dataset: DannyAI/African-History-QA-Dataset with base_type: alpaca_chat.load_qa and prompt_style: None
168
+ [2026-01-24 12:52:01,722] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:6937] min_input_len: 52
169
+ [2026-01-24 12:52:01,722] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:6937] max_input_len: 179
170
+
171
+
172
+
173
+
174
+ [2026-01-24 12:52:03,950] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:6937] Unable to find prepared dataset in last_run_prepared/1affaed26259409613b775fd6050f3a2
175
+ [2026-01-24 12:52:03,951] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:6937] Loading raw datasets...
176
+ [2026-01-24 12:52:03,951] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:6937] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
177
+ [2026-01-24 12:52:05,236] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:6937] Loading dataset: DannyAI/African-History-QA-Dataset with base_type: alpaca_chat.load_qa and prompt_style: None
178
+ [2026-01-24 12:52:05,446] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:224] [PID:6937] min_input_len: 54
179
+ [2026-01-24 12:52:05,446] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:226] [PID:6937] max_input_len: 169
180
+
181
+
182
+
183
+
184
+ [2026-01-24 12:52:07,313] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:6937] total_num_tokens: 205_770
185
+ [2026-01-24 12:52:07,348] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:6937] `total_supervised_tokens: 94_469`
186
+ [2026-01-24 12:52:07,377] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:6937] Using single process for pack_parallel, running sequentially.
187
+ [2026-01-24 12:52:08,322] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:6937] Using single process for pack_parallel, running sequentially.
188
+ [2026-01-24 12:52:08,510] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.18906092643737793
189
+ [2026-01-24 12:52:08,511] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:6937] Using single process for pack_parallel, running sequentially.
190
+ [2026-01-24 12:52:08,705] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.19425106048583984
191
+ [2026-01-24 12:52:08,705] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:6937] Using single process for pack_parallel, running sequentially.
192
+ [2026-01-24 12:52:08,914] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.20883750915527344
193
+ [2026-01-24 12:52:08,914] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:6937] Using single process for pack_parallel, running sequentially.
194
+ [2026-01-24 12:52:09,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.20302033424377441
195
+ [2026-01-24 12:52:09,156] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:6937] gather_len_batches: [51]
196
+ [2026-01-24 12:52:09,156] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:494] [PID:6937] data_loader_len: 12
197
+ [2026-01-24 12:52:09,156] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:510] [PID:6937] sample_packing_eff_est across ranks: [0.9850356158088235]
198
+ [2026-01-24 12:52:09,156] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:6937] sample_packing_eff_est: 0.99
199
+ [2026-01-24 12:52:09,156] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:6937] total_num_steps: 120
200
+ [2026-01-24 12:52:09,157] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:6937] Maximum number of steps set at 120
201
+ [2026-01-24 12:52:09,212] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:6937] loading tokenizer... microsoft/Phi-4-mini-instruct
202
+ [2026-01-24 12:52:10,586] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:6937] EOS: 199999 / <|endoftext|>
203
+ [2026-01-24 12:52:10,586] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:6937] BOS: 199999 / <|endoftext|>
204
+ [2026-01-24 12:52:10,586] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:6937] PAD: 199999 / <|endoftext|>
205
+ [2026-01-24 12:52:10,586] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:6937] UNK: 199999 / <|endoftext|>
206
+ [2026-01-24 12:52:10,586] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:6937] Loading model
207
+ [2026-01-24 12:52:10,782] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:6937] Patched Trainer.evaluation_loop with nanmean loss calculation
208
+ [2026-01-24 12:52:10,788] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:6937] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
209
+ [2026-01-24 12:52:10,789] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:345] [PID:6937] Applying multipack dataloader patch for sample packing...
210
+
211
+ [2026-01-24 12:52:16,277] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:347] [PID:6937] Converting modules to torch.bfloat16
212
+ [2026-01-24 12:52:17,178] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:6937] Memory usage after model load 0.000GB ()
213
+ trainable params: 1,572,864 || all params: 3,837,594,624 || trainable%: 0.0410
214
+ [2026-01-24 12:52:17,232] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:6937] after adapters 0.000GB ()
215
+ [2026-01-24 12:52:47,260] [INFO] [axolotl.train.save_initial_configs:413] [PID:6937] Pre-saving adapter config to ./phi4_african_history_lora_out...
216
+ [2026-01-24 12:52:47,261] [INFO] [axolotl.train.save_initial_configs:417] [PID:6937] Pre-saving tokenizer to ./phi4_african_history_lora_out...
217
+ [2026-01-24 12:52:47,558] [INFO] [axolotl.train.save_initial_configs:422] [PID:6937] Pre-saving model config to ./phi4_african_history_lora_out...
218
+ [2026-01-24 12:52:47,563] [INFO] [axolotl.train.execute_training:212] [PID:6937] Starting trainer...
219
+ [2026-01-24 12:52:49,594] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.7298610210418701
220
+ [2026-01-24 12:52:50,250] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.6557362079620361
221
+ [2026-01-24 12:52:50,759] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.5084567070007324
222
+ [2026-01-24 12:52:51,268] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:6937] generate_batches time: 0.5085635185241699
223
+ [2026-01-24 12:52:51,268] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:6937] gather_len_batches: [51]
224
+ wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
225
+ wandb: Currently logged in as: dannyai (dannyai-danny-the-analyst) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
226
+ wandb: ⢿ Waiting for wandb.init()...
227
+
228
+
229
+
230
+ wandb: Run data is saved locally in /workspace/wandb/run-20260124_125251-ehuutbte
231
+ wandb: Run `wandb offline` to turn off syncing.
232
+ wandb: Syncing run phi4_lora_axolotl
233
+ wandb: ⭐️ View project at https://wandb.ai/dannyai-danny-the-analyst/phi4_african_history
234
+ wandb: 🚀 View run at https://wandb.ai/dannyai-danny-the-analyst/phi4_african_history/runs/ehuutbte
235
+ wandb: Detected [huggingface_hub.inference] in use.
236
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
237
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
238
+ wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
239
+ wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
240
+ [2026-01-24 12:52:54,066] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:6937] The Axolotl config has been saved to the WandB run under files.
241
+
242
  0%| | 0/120 [00:00<?, ?it/s][2026-01-24 12:52:54,083] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:6937] Running evaluation step...
243
+
244
+
245
  0%| | 0/100 [00:00<?, ?it/s]
246
+
247
  2%|▋ | 2/100 [00:00<00:08, 11.63it/s]
248
+
249
  4%|█▍ | 4/100 [00:01<00:32, 2.97it/s]
250
+
251
  5%|█▊ | 5/100 [00:01<00:37, 2.50it/s]
252
+
253
  6%|██▏ | 6/100 [00:02<00:40, 2.32it/s]
254
+
255
  7%|██▌ | 7/100 [00:02<00:42, 2.20it/s]
256
+
257
  8%|██▉ | 8/100 [00:03<00:43, 2.12it/s]
258
+
259
  9%|███▏ | 9/100 [00:03<00:45, 2.01it/s]
260
+
261
  10%|███▌ | 10/100 [00:04<00:44, 2.00it/s]
262
+
263
  11%|███▊ | 11/100 [00:04<00:44, 1.99it/s]
264
+
265
  12%|████▏ | 12/100 [00:05<00:44, 1.98it/s]
266
+
267
  13%|████▌ | 13/100 [00:05<00:45, 1.92it/s]
268
+
269
  14%|████▉ | 14/100 [00:06<00:44, 1.94it/s]
270
+
271
  15%|█████▎ | 15/100 [00:06<00:43, 1.94it/s]
272
+
273
  16%|█████▌ | 16/100 [00:07<00:43, 1.95it/s]
274
+
275
  17%|█████▉ | 17/100 [00:07<00:43, 1.91it/s]
276
+
277
  18%|██████▎ | 18/100 [00:08<00:42, 1.93it/s]
278
+
279
  19%|██████▋ | 19/100 [00:09<00:41, 1.94it/s]
280
+
281
  20%|███████ | 20/100 [00:09<00:41, 1.95it/s]
282
+
283
  21%|███████▎ | 21/100 [00:10<00:41, 1.91it/s]
284
+
285
  22%|███████▋ | 22/100 [00:10<00:40, 1.93it/s]
286
+
287
  23%|████████ | 23/100 [00:11<00:39, 1.94it/s]
288
+
289
  24%|████████▍ | 24/100 [00:11<00:39, 1.95it/s]
290
+
291
  25%|████████▊ | 25/100 [00:12<00:39, 1.90it/s]
292
+
293
  26%|█████████ | 26/100 [00:12<00:38, 1.93it/s]
294
+
295
  27%|█████████▍ | 27/100 [00:13<00:37, 1.94it/s]
296
+
297
  28%|█████████▊ | 28/100 [00:13<00:37, 1.94it/s]
298
+
299
  29%|██████████▏ | 29/100 [00:14<00:37, 1.90it/s]
300
+
301
  30%|██████████▌ | 30/100 [00:14<00:36, 1.92it/s]
302
+
303
  31%|██████████▊ | 31/100 [00:15<00:35, 1.93it/s]
304
+
305
  32%|███████████▏ | 32/100 [00:15<00:35, 1.94it/s]
306
+
307
  33%|███████████▌ | 33/100 [00:16<00:35, 1.90it/s]
308
+
309
  34%|███████████▉ | 34/100 [00:16<00:34, 1.92it/s]
310
+
311
  35%|████████████▎ | 35/100 [00:17<00:33, 1.93it/s]
312
+
313
  36%|████████████▌ | 36/100 [00:17<00:33, 1.94it/s]
314
+
315
  37%|████████████▉ | 37/100 [00:18<00:33, 1.90it/s]
316
+
317
  38%|█████████████▎ | 38/100 [00:18<00:32, 1.92it/s]
318
+
319
  39%|█████████████▋ | 39/100 [00:19<00:31, 1.93it/s]
320
+
321
  40%|██████████████ | 40/100 [00:19<00:30, 1.94it/s]
322
+
323
  41%|██████████████▎ | 41/100 [00:20<00:31, 1.90it/s]
324
+
325
  42%|██████████████▋ | 42/100 [00:20<00:30, 1.92it/s]
326
+
327
  43%|███████████████ | 43/100 [00:21<00:29, 1.93it/s]
328
+
329
  44%|███████████████▍ | 44/100 [00:21<00:28, 1.94it/s]
330
+
331
  45%|███████████████▊ | 45/100 [00:22<00:28, 1.90it/s]
332
+
333
  46%|████████████████ | 46/100 [00:23<00:28, 1.92it/s]
334
+
335
  47%|████████████████▍ | 47/100 [00:23<00:27, 1.93it/s]
336
+
337
  48%|████████████████▊ | 48/100 [00:24<00:26, 1.94it/s]
338
+
339
  49%|█████████████████▏ | 49/100 [00:24<00:26, 1.90it/s]
340
+
341
  50%|█████████████████▌ | 50/100 [00:25<00:26, 1.92it/s]
342
+
343
  51%|█████████████████▊ | 51/100 [00:25<00:25, 1.93it/s]
344
+
345
  52%|██████████████████▏ | 52/100 [00:26<00:24, 1.94it/s]
346
+
347
  53%|██████████████████▌ | 53/100 [00:26<00:24, 1.90it/s]
348
+
349
  54%|██████████████████▉ | 54/100 [00:27<00:23, 1.92it/s]
350
+
351
  55%|███████████████████▎ | 55/100 [00:27<00:23, 1.93it/s]
352
+
353
  56%|███████████████████▌ | 56/100 [00:28<00:22, 1.93it/s]
354
+
355
  57%|███████████████████▉ | 57/100 [00:28<00:22, 1.90it/s]
356
+
357
  58%|████████████████████▎ | 58/100 [00:29<00:21, 1.92it/s]
358
+
359
  59%|████████████████████▋ | 59/100 [00:29<00:21, 1.92it/s]
360
+
361
  60%|█████████████████████ | 60/100 [00:30<00:20, 1.93it/s]
362
+
363
  61%|█████████████████████▎ | 61/100 [00:30<00:20, 1.89it/s]
364
+
365
  62%|█████████████████████▋ | 62/100 [00:31<00:19, 1.91it/s]
366
+
367
  63%|██████████████████████ | 63/100 [00:31<00:19, 1.92it/s]
368
+
369
  64%|██████████████████████▍ | 64/100 [00:32<00:18, 1.93it/s]
370
+
371
  65%|██████████████████████▊ | 65/100 [00:32<00:18, 1.89it/s]
372
+
373
  66%|███████████████████████ | 66/100 [00:33<00:17, 1.91it/s]
374
+
375
  67%|███████████████████████▍ | 67/100 [00:34<00:17, 1.92it/s]
376
+
377
  68%|███████████████████████▊ | 68/100 [00:34<00:16, 1.93it/s]
378
+
379
  69%|████████████████████████▏ | 69/100 [00:35<00:16, 1.89it/s]
380
+
381
  70%|████████████████████████▌ | 70/100 [00:35<00:15, 1.91it/s]
382
+
383
  71%|████████████████████████▊ | 71/100 [00:36<00:15, 1.92it/s]
384
+
385
  72%|█████████████████████████▏ | 72/100 [00:36<00:14, 1.93it/s]
386
+
387
  73%|█████████████████████████▌ | 73/100 [00:37<00:14, 1.89it/s]
388
+
389
  74%|█████████████████████████▉ | 74/100 [00:37<00:13, 1.91it/s]
390
+
391
  75%|██████████████████████████▎ | 75/100 [00:38<00:13, 1.92it/s]
392
+
393
  76%|██████████████████████████▌ | 76/100 [00:38<00:12, 1.93it/s]
394
+
395
  77%|██████████████████████████▉ | 77/100 [00:39<00:12, 1.89it/s]
396
+
397
  78%|███████████████████████████▎ | 78/100 [00:39<00:11, 1.91it/s]
398
+
399
  79%|██████████████��████████████▋ | 79/100 [00:40<00:10, 1.92it/s]
400
+
401
  80%|████████████████████████████ | 80/100 [00:40<00:10, 1.92it/s]
402
+
403
  81%|████████████████████████████▎ | 81/100 [00:41<00:10, 1.88it/s]
404
+
405
  82%|████████████████████████████▋ | 82/100 [00:41<00:09, 1.91it/s]
406
+
407
  83%|█████████████████████████████ | 83/100 [00:42<00:08, 1.92it/s]
408
+
409
  84%|█████████████████████████████▍ | 84/100 [00:42<00:08, 1.92it/s]
410
+
411
  85%|█████████████████████████████▊ | 85/100 [00:43<00:07, 1.88it/s]
412
+
413
  86%|██████████████████████████████ | 86/100 [00:43<00:07, 1.90it/s]
414
+
415
  87%|██████████████████████████████▍ | 87/100 [00:44<00:06, 1.91it/s]
416
+
417
  88%|██████████████████████████████▊ | 88/100 [00:44<00:06, 1.92it/s]
418
+
419
  89%|███████████████████████████████▏ | 89/100 [00:45<00:05, 1.88it/s]
420
+
421
  90%|███████████████████████████████▌ | 90/100 [00:46<00:05, 1.90it/s]
422
+
423
  91%|███████████████████████████████▊ | 91/100 [00:46<00:04, 1.91it/s]
424
+
425
  92%|████████████████████████████████▏ | 92/100 [00:47<00:04, 1.92it/s]
426
+
427
  93%|████████████████████████████████▌ | 93/100 [00:47<00:03, 1.88it/s]
428
+
429
  94%|████████████████████████████████▉ | 94/100 [00:48<00:03, 1.90it/s]
430
+
431
  95%|█████████████████████████████████▎ | 95/100 [00:48<00:02, 1.91it/s]
432
+
433
  96%|█████████████████████████████████▌ | 96/100 [00:49<00:02, 1.92it/s]
434
+
435
  97%|█████████████████████████████████▉ | 97/100 [00:49<00:01, 1.88it/s]
436
+
437
  98%|██████████████████████████████████▎| 98/100 [00:50<00:01, 1.90it/s]
438
+
439
  99%|██████████████████████████████████▋| 99/100 [00:50<00:00, 1.91it/s]
440
+
441
 
442
+
443
 
444
+
445
  0%| | 0/120 [00:53<?, ?it/s]
446
+
447
+
448
  
449
  1%|▎ | 1/120 [01:03<2:06:12, 63.63s/it]
450
  2%|▌ | 2/120 [01:11<1:01:02, 31.04s/it]
451
  2%|▉ | 3/120 [01:20<40:13, 20.63s/it]
452
  3%|█▏ | 4/120 [01:28<30:26, 15.74s/it]
453
  4%|█▌ | 5/120 [01:36<24:59, 13.04s/it]
454
 
455
+
456
  4%|█▌ | 5/120 [01:36<24:59, 13.04s/it]
457
  5%|█▊ | 6/120 [01:44<21:41, 11.41s/it]
458
  6%|██ | 7/120 [01:53<19:33, 10.38s/it]
459
  7%|██▍ | 8/120 [02:01<18:06, 9.70s/it]
460
  8%|██▋ | 9/120 [02:09<17:06, 9.25s/it]
461
  8%| | 10/120 [02:17<16:23, 8
462
 
463
+
464
  8%| | 10/120 [02:17<16:23, 8
465
  9%|█████▌ | 11/120 [02:26<15:51, 8.73s/it]
466
  10%|████████████▍ | 12/120 [02:34<15:26, 8.58s/it]
467
  11%|█████████████▍ | 13/120 [02:40<14:02, 7.87s/it]
468
  12%|████ | 14/120 [02:50<15:11, 8.60s/it]
469
  12%|███████▋ | 15/120 [02:59<14:52, 8.50s/it]
470
 
471
+
472
  12%|███████▋ | 15/120 [02:59<14:52, 8.50s/it]
473
  13%|████████▏ | 16/120 [03:07<14:37, 8.44s/it]
474
  14%|████████▋ | 17/120 [03:15<14:24, 8.39s/it]
475
  15%|█████████▏ | 18/120 [03:23<14:12, 8.35s/it]
476
  16%|█████████▋ | 19/120 [03:32<14:01, 8.34s/it]
477
  17%|██████████▏ | 20/120 [03:40<13:51, 8.32s/it]
478
 
479
+
480
  17%|██████████▏ | 20/120 [03:40<13:51, 8.32s/it]
481
  18%|██████████▋ | 21/120 [03:48<13:42, 8.30s/it]
482
  18%|██████▍ | 22/120 [03:57<13:33, 8.30s/it]
483
  19%|██████▋ | 23/120 [04:05<13:24, 8.29s/it]
484
  20%|███████ | 24/120 [04:13<13:15, 8.29s/it]
485
  21%|███████▎ | 25/120 [04:21<13:07, 8.29s/it]
486
 
487
+
488
  21%|███████▎ | 25/120 [04:21<13:07, 8.29s/it]
489
  22%|███████▌ | 26/120 [04:28<12:02, 7.69s/it]
490
  22%|███████▉ | 27/120 [04:39<13:27, 8.68s/it]
491
  23%|████████▏ | 28/120 [04:47<13:07, 8.56s/it]
492
  24%|████████▍ | 29/120 [04:54<12:26, 8.20s/it]
493
  25%|████████▊ | 30/120 [05:03<12:19, 8.22s/it]
494
 
495
+
496
  25%|████████▊ | 30/120 [05:03<12:19, 8.22s/it]
497
  26%|█████████ | 31/120 [05:11<12:12, 8.23s/it]
498
  27%|█████████▎ | 32/120 [05:19<12:05, 8.24s/it]
499
  28%|█████████▋ | 33/120 [05:27<11:57, 8.25s/it]
500
  28%|█████████▉ | 34/120 [05:36<11:50, 8.26s/it]
501
  29%|██████████▏ | 35/120 [05:44<11:41, 8.26s/it]
502
 
503
+
504
  29%|██████████▏ | 35/120 [05:44<11:41, 8.26s/it]
505
  30%|██████████▌ | 36/120 [05:52<11:33, 8.26s/it]
506
  31%|██████████▊ | 37/120 [06:01<11:25, 8.26s/it]
507
  32%|███████████ | 38/120 [06:09<11:17, 8.26s/it]
508
  32%|███████████▍ | 39/120 [06:15<10:20, 7.66s/it]
509
  33%|███████████▋ | 40/120 [06:25<11:15, 8.44s/it]
510
 
511
+
512
  33%|███████████▋ | 40/120 [06:25<11:15, 8.44s/it]
513
  34%|███████████▉ | 41/120 [06:34<11:03, 8.39s/it]
514
  35%|████████████▎ | 42/120 [06:42<10:52, 8.36s/it]
515
  36%|████████████▌ | 43/120 [06:50<10:42, 8.34s/it]
516
  37%|████████████▊ | 44/120 [06:58<10:32, 8.32s/it]
517
  38%|█████████████▏ | 45/120 [07:07<10:22, 8.31s/it]
518
 
519
+
520
  38%|█████████████▏ | 45/120 [07:07<10:22, 8.31s/it]
521
  38%|█████████████▍ | 46/120 [07:15<10:13, 8.30s/it]
522
  39%|█████████████▋ | 47/120 [07:23<10:04, 8.29s/it]
523
  40%|██████████████ | 48/120 [07:32<09:56, 8.28s/it]
524
  41%|██████████████▎ | 49/120 [07:39<09:27, 8.00s/it]
525
  42%|██████████████▌ | 50/120 [07:47<09:25, 8.07s/it]
526
 
527
+
528
  42%|██████████████▌ | 50/120 [07:47<09:25, 8.07s/it]
529
  42%|██████████████▉ | 51/120 [07:55<09:20, 8.13s/it]
530
  43%|███████████████▏ | 52/120 [08:02<08:34, 7.57s/it]
531
  44%|███████████████▍ | 53/120 [08:12<09:17, 8.33s/it]
532
  45%|███████████████▊ | 54/120 [08:20<09:08, 8.31s/it]
533
  46%|████████████████ | 55/120 [08:28<08:59, 8.30s/it]
534
 
535
+
536
  46%|████████████████ | 55/120 [08:28<08:59, 8.30s/it]
537
  47%|████████████████▎ | 56/120 [08:37<08:50, 8.29s/it]
538
  48%|████████████████▋ | 57/120 [08:45<08:41, 8.28s/it]
539
  48%|████████████████▉ | 58/120 [08:53<08:33, 8.28s/it]
540
  49%|█████████████████▏ | 59/120 [09:01<08:24, 8.27s/it]
541
  50%|█████████████████▌ | 60/120 [09:10<08:16, 8.27s/it]
542
 
543
+
544
  50%|█████████████████▌ | 60/120 [09:10<08:16, 8.27s/it]
545
  51%|█████████████████▊ | 61/120 [09:18<08:08, 8.27s/it]
546
  52%|██████████████████ | 62/120 [09:26<07:59, 8.27s/it]
547
  52%|██████████████████▍ | 63/120 [09:34<07:51, 8.27s/it]
548
  53%|██████████████████▋ | 64/120 [09:43<07:42, 8.27s/it]
549
  54%|██████████████████▉ | 65/120 [09:49<07:01, 7.66s/it]
550
 
551
+
552
  54%|██████████████████▉ | 65/120 [09:49<07:01, 7.66s/it]
553
  55%|███████████████████▎ | 66/120 [09:59<07:34, 8.42s/it]
554
  56%|███████████████████▌ | 67/120 [10:07<07:23, 8.38s/it]
555
  57%|███████████████████▊ | 68/120 [10:16<07:13, 8.34s/it]
556
  57%|████████████████████▏ | 69/120 [10:24<07:04, 8.32s/it]
557
  58%|████████████████████▍ | 70/120 [10:32<06:55, 8.30s/it]
558
 
559
+
560
  58%|████████████████████▍ | 70/120 [10:32<06:55, 8.30s/it]
561
  59%|████████████████████▋ | 71/120 [10:40<06:46, 8.30s/it]
562
  60%|█████████████████████ | 72/120 [10:49<06:37, 8.29s/it]
563
  61%|█████████████████████▎ | 73/120 [10:57<06:29, 8.28s/it]
564
  62%|█████████████████████▌ | 74/120 [11:04<06:07, 8.00s/it]
565
  62%|█████████████████████▉ | 75/120 [11:13<06:03, 8.08s/it]
566
 
567
+
568
  62%|█████████████████████▉ | 75/120 [11:13<06:03, 8.08s/it]
569
  63%|██████████████████████▏ | 76/120 [11:21<05:58, 8.14s/it]
570
  64%|██████████████████████▍ | 77/120 [11:29<05:51, 8.18s/it]
571
  65%|██████████████████████▊ | 78/120 [11:35<05:19, 7.60s/it]
572
  66%|███████████████████████ | 79/120 [11:46<05:44, 8.40s/it]
573
  67%|███████████████████████▎ | 80/120 [11:54<05:34, 8.36s/it]
574
 
575
+
576
  67%|███████████████████████▎ | 80/120 [11:54<05:34, 8.36s/it]
577
  68%|███████████████████████▋ | 81/120 [12:02<05:24, 8.33s/it]
578
  68%|███████████████████████▉ | 82/120 [12:10<05:15, 8.31s/it]
579
  69%|████████████████████████▏ | 83/120 [12:19<05:07, 8.30s/it]
580
  70%|████████████████████████▌ | 84/120 [12:27<04:58, 8.29s/it]
581
  71%|████████████████████████▊ | 85/120 [12:35<04:49, 8.28s/it]
582
 
583
+
584
  71%|████████████████████████▊ | 85/120 [12:35<04:49, 8.28s/it]
585
  72%|█████████████████████████ | 86/120 [12:43<04:41, 8.27s/it]
586
  72%|█████████████████████████▍ | 87/120 [12:52<04:32, 8.27s/it]
587
  73%|█████████████████████████▋ | 88/120 [12:59<04:15, 7.99s/it]
588
  74%|█████████████████████████▉ | 89/120 [13:07<04:10, 8.07s/it]
589
  75%|██████████████████████████▎ | 90/120 [13:16<04:03, 8.13s/it]
590
 
591
+
592
  75%|██████████████████████████▎ | 90/120 [13:16<04:03, 8.13s/it]
593
  76%|██████████████████████████▌ | 91/120 [13:22<03:39, 7.57s/it]
594
  77%|██████████████████████████▊ | 92/120 [13:32<03:54, 8.36s/it]
595
  78%|███████████████████████████▏ | 93/120 [13:39<03:37, 8.05s/it]
596
  78%|███████████████████████████▍ | 94/120 [13:48<03:30, 8.11s/it]
597
  79%|███████████████████████████▋ | 95/120 [13:56<03:23, 8.15s/it]
598
 
599
+
600
  79%|███████████████████████████▋ | 95/120 [13:56<03:23, 8.15s/it]
601
  80%|████████████████████████████ | 96/120 [14:04<03:16, 8.18s/it]
602
  81%|████████████████████████████▎ | 97/120 [14:12<03:08, 8.20s/it]
603
  82%|████████████████████████████▌ | 98/120 [14:21<03:00, 8.22s/it]
604
  82%|████████████████████████████▉ | 99/120 [14:29<02:52, 8.23s/it]
605
  83%|████████████████████████████▎ | 100/120 [14:36<02:39, 7.96s/it]
606
 
607
+
608
  83%|████████████████████████████▎ | 100/120 [14:36<02:39, 7.96s/it][2026-01-24 13:07:30,848] [INFO] [axolotl.core.trainers.base.evaluate:400] [PID:6937] Running evaluation step...
609
+
610
+
611
  0%| | 0/100 [00:00<?, ?it/s]
612
+
613
  2%|▋ | 2/100 [00:00<00:25, 3.84it/s]
614
+
615
  3%|█ | 3/100 [00:01<00:35, 2.70it/s]
616
+
617
  4%|█▍ | 4/100 [00:01<00:41, 2.33it/s]
618
+
619
  5%|█▊ | 5/100 [00:02<00:54, 1.74it/s]
620
+
621
  6%|██▏ | 6/100 [00:02<00:52, 1.79it/s]
622
+
623
  7%|██▌ | 7/100 [00:03<00:50, 1.83it/s]
624
+
625
  8%|██▉ | 8/100 [00:03<00:49, 1.85it/s]
626
+
627
  9%|███▏ | 9/100 [00:04<00:49, 1.83it/s]
628
+
629
  10%|███▌ | 10/100 [00:05<00:48, 1.86it/s]
630
+
631
  11%|███▊ | 11/100 [00:05<00:47, 1.87it/s]
632
+
633
  12%|████▏ | 12/100 [00:06<00:46, 1.89it/s]
634
+
635
  13%|████▌ | 13/100 [00:06<00:46, 1.85it/s]
636
+
637
  14%|████▉ | 14/100 [00:07<00:45, 1.87it/s]
638
+
639
  15%|█████▎ | 15/100 [00:07<00:45, 1.88it/s]
640
+
641
  16%|█████▌ | 16/100 [00:08<00:44, 1.89it/s]
642
+
643
  17%|█████▉ | 17/100 [00:08<00:44, 1.86it/s]
644
+
645
  18%|██████▎ | 18/100 [00:09<00:43, 1.88it/s]
646
+
647
  19%|██████▋ | 19/100 [00:09<00:42, 1.89it/s]
648
+
649
  20%|███████ | 20/100 [00:10<00:42, 1.89it/s]
650
+
651
  21%|███████▎ | 21/100 [00:10<00:42, 1.86it/s]
652
+
653
  22%|███████▋ | 22/100 [00:11<00:41, 1.88it/s]
654
+
655
  23%|████████ | 23/100 [00:11<00:40, 1.89it/s]
656
+
657
  24%|████████▍ | 24/100 [00:12<00:40, 1.90it/s]
658
+
659
  25%|████████▊ | 25/100 [00:13<00:40, 1.86it/s]
660
+
661
  26%|█████████ | 26/100 [00:13<00:39, 1.88it/s]
662
+
663
  27%|█████████▍ | 27/100 [00:14<00:38, 1.89it/s]
664
+
665
  28%|█████████▊ | 28/100 [00:14<00:37, 1.90it/s]
666
+
667
  29%|██████████▏ | 29/100 [00:15<00:38, 1.86it/s]
668
+
669
  30%|██████████▌ | 30/100 [00:15<00:37, 1.88it/s]
670
+
671
  31%|██████████▊ | 31/100 [00:16<00:36, 1.89it/s]
672
+
673
  32%|███████████▏ | 32/100 [00:16<00:35, 1.90it/s]
674
+
675
  33%|███████████▌ | 33/100 [00:17<00:36, 1.86it/s]
676
+
677
  34%|███████████▉ | 34/100 [00:17<00:35, 1.88it/s]
678
+
679
  35%|████████████▎ | 35/100 [00:18<00:34, 1.89it/s]
680
+
681
  36%|████████████▌ | 36/100 [00:18<00:33, 1.90it/s]
682
+
683
  37%|████████████▉ | 37/100 [00:19<00:33, 1.86it/s]
684
+
685
  38%|█████████████▎ | 38/100 [00:19<00:32, 1.88it/s]
686
+
687
  39%|█████████████▋ | 39/100 [00:20<00:32, 1.89it/s]
688
+
689
  40%|██████████████ | 40/100 [00:21<00:31, 1.90it/s]
690
+
691
  41%|██████████████▎ | 41/100 [00:21<00:31, 1.86it/s]
692
+
693
  42%|██████████████▋ | 42/100 [00:22<00:30, 1.88it/s]
694
+
695
  43%|███████████████ | 43/100 [00:22<00:30, 1.89it/s]
696
+
697
  44%|███████████████▍ | 44/100 [00:23<00:29, 1.90it/s]
698
+
699
  45%|███████████████▊ | 45/100 [00:23<00:29, 1.86it/s]
700
+
701
  46%|████████████████ | 46/100 [00:24<00:28, 1.88it/s]
702
+
703
  47%|████████████████▍ | 47/100 [00:24<00:28, 1.89it/s]
704
+
705
  48%|████████████████▊ | 48/100 [00:25<00:27, 1.90it/s]
706
+
707
  49%|█████████████████▏ | 49/100 [00:25<00:27, 1.86it/s]
708
+
709
  50%|█████████████████▌ | 50/100 [00:26<00:26, 1.88it/s]
710
+
711
  51%|█████████████████▊ | 51/100 [00:26<00:25, 1.89it/s]
712
+
713
  52%|██████████████████▏ | 52/100 [00:27<00:25, 1.89it/s]
714
+
715
  53%|██████████████████▌ | 53/100 [00:27<00:25, 1.86it/s]
716
+
717
  54%|██████████████████▉ | 54/100 [00:28<00:24, 1.88it/s]
718
+
719
  55%|███████████████████▎ | 55/100 [00:28<00:23, 1.89it/s]
720
+
721
  56%|███████████████████▌ | 56/100 [00:29<00:23, 1.89it/s]
722
+
723
  57%|███████████████████▉ | 57/100 [00:30<00:23, 1.86it/s]
724
+
725
  58%|████████████████████▎ | 58/100 [00:30<00:22, 1.88it/s]
726
+
727
  59%|████████████████████▋ | 59/100 [00:31<00:21, 1.89it/s]
728
+
729
  60%|█████████████████████ | 60/100 [00:31<00:21, 1.89it/s]
730
+
731
  61%|█████████████████████▎ | 61/100 [00:32<00:20, 1.86it/s]
732
+
733
  62%|█████████████████████▋ | 62/100 [00:32<00:20, 1.88it/s]
734
+
735
  63%|██████████████████████ | 63/100 [00:33<00:19, 1.89it/s]
736
+
737
  64%|██████████████████████▍ | 64/100 [00:33<00:18, 1.90it/s]
738
+
739
  65%|██████████████████████▊ | 65/100 [00:34<00:18, 1.86it/s]
740
+
741
  66%|███████████████████████ | 66/100 [00:34<00:18, 1.88it/s]
742
+
743
  67%|███████████████████████▍ | 67/100 [00:35<00:17, 1.89it/s]
744
+
745
  68%|███████████████████████▊ | 68/100 [00:35<00:16, 1.90it/s]
746
+
747
  69%|████████████████████████▏ | 69/100 [00:36<00:16, 1.86it/s]
748
+
749
  70%|████████████████████████▌ | 70/100 [00:36<00:15, 1.88it/s]
750
+
751
  71%|████████████████████████▊ | 71/100 [00:37<00:15, 1.89it/s]
752
+
753
  72%|█████████████████████████▏ | 72/100 [00:38<00:14, 1.90it/s]
754
+
755
  73%|█████████████████████████▌ | 73/100 [00:38<00:14, 1.86it/s]
756
+
757
  74%|█████████████████████████▉ | 74/100 [00:39<00:13, 1.88it/s]
758
+
759
  75%|██████████████████████████▎ | 75/100 [00:39<00:13, 1.89it/s]
760
+
761
  76%|██████████████████████████▌ | 76/100 [00:40<00:12, 1.89it/s]
762
+
763
  77%|██████████████████████████▉ | 77/100 [00:40<00:12, 1.86it/s]
764
+
765
  78%|███████████████████████████▎ | 78/100 [00:41<00:11, 1.88it/s]
766
+
767
  79%|███████████████████████████▋ | 79/100 [00:41<00:11, 1.89it/s]
768
+
769
  80%|████████████████████████████ | 80/100 [00:42<00:10, 1.89it/s]
770
+
771
  81%|████████████████████████████▎ | 81/100 [00:42<00:10, 1.86it/s]
772
+
773
  82%|████████████████████████████▋ | 82/100 [00:43<00:09, 1.88it/s]
774
+
775
  83%|█████████████████████████████ | 83/100 [00:43<00:08, 1.89it/s]
776
+
777
  84%|█████████████████████████████▍ | 84/100 [00:44<00:08, 1.90it/s]
778
+
779
  85%|█████████████████████████████▊ | 85/100 [00:44<00:08, 1.86it/s]
780
+
781
  86%|██████████████████████████████ | 86/100 [00:45<00:07, 1.88it/s]
782
+
783
  87%|██████████████████████████████▍ | 87/100 [00:46<00:06, 1.89it/s]
784
+
785
  88%|██████████████████████████████▊ | 88/100 [00:46<00:06, 1.90it/s]
786
+
787
  89%|███████████████████████████████▏ | 89/100 [00:47<00:05, 1.86it/s]
788
+
789
  90%|███████████████████████████████▌ | 90/100 [00:47<00:05, 1.88it/s]
790
+
791
  91%|███████████████████████████████▊ | 91/100 [00:48<00:04, 1.89it/s]
792
+
793
  92%|████████████████████████████████▏ | 92/100 [00:48<00:04, 1.90it/s]
794
+
795
  93%|████████████████████████████████▌ | 93/100 [00:49<00:03, 1.86it/s]
796
+
797
  94%|████████████████████████████████▉ | 94/100 [00:49<00:03, 1.88it/s]
798
+
799
  95%|█████████████████████████████████▎ | 95/100 [00:50<00:02, 1.89it/s]
800
+
801
  96%|█████████████████████████████████▌ | 96/100 [00:50<00:02, 1.90it/s]
802
+
803
  97%|█████████████████████████████████▉ | 97/100 [00:51<00:01, 1.86it/s]
804
+
805
  98%|██████████████████████████████████▎| 98/100 [00:51<00:01, 1.88it/s]
806
+
807
  99%|██████████████████████████████████▋| 99/100 [00:52<00:00, 1.89it/s]
808
+
809
 
810
+
811
 
812
+
813
  83%|████████████████████████████▎ | 100/120 [15:30<02:39, 7.96s/it]
814
+
815
+
816
  
817
  84%|████████████████████████████▌ | 101/120 [15:39<07:42, 24.33s/it]
818
  85%|████████████████████████████▉ | 102/120 [15:47<05:51, 19.51s/it]
819
  86%|█████████████████████████████▏ | 103/120 [15:55<04:34, 16.14s/it]
820
  87%|█████████████████████████████▍ | 104/120 [16:02<03:30, 13.17s/it]
821
  88%|█████████████████████████████▊ | 105/120 [16:12<03:04, 12.32s/it]
822
 
823
+
824
  88%|█████████████████████████████▊ | 105/120 [16:12<03:04, 12.32s/it]
825
  88%|██████████████████████████████ | 106/120 [16:20<02:35, 11.11s/it]
826
  89%|██████████████████████████████▎ | 107/120 [16:28<02:13, 10.26s/it]
827
  90%|██████████████████████████████▌ | 108/120 [16:37<01:55, 9.66s/it]
828
  91%|██████████████████████████████▉ | 109/120 [16:45<01:41, 9.25s/it]
829
  92%|███████████████████████████████▏ | 110/120 [16:53<01:29, 8.96s/it]
830
 
831
+
832
  92%|███████████████████████████████▏ | 110/120 [16:53<01:29, 8.96s/it]
833
  92%|███████████████████████████████▍ | 111/120 [17:02<01:18, 8.75s/it]
834
  93%|███████████████████████████████▋ | 112/120 [17:10<01:08, 8.60s/it]
835
  94%|████████████████████████████████ | 113/120 [17:18<00:59, 8.50s/it]
836
  95%|████████████████████████████████▎ | 114/120 [17:26<00:50, 8.43s/it]
837
  96%|████████████████████████████████▌ | 115/120 [17:35<00:41, 8.38s/it]
838
 
839
+
840
  96%|████████████████████████████████▌ | 115/120 [17:35<00:41, 8.38s/it]
841
  97%|████████████████████████████████▊ | 116/120 [17:43<00:33, 8.34s/it]
842
  98%|█████████████████████████████████▏| 117/120 [17:49<00:23, 7.72s/it]
843
  98%|█████████████████████████████████▍| 118/120 [17:59<00:16, 8.47s/it]
844
  99%|█████████████████████████████████▋| 119/120 [18:08<00:08, 8.41s/it]
845
 
846
+
847
+
848
 
849
+
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:382cc235b56c725945e149cc25f191da667c836655efd0857b004320e90e91ea
3
+ size 15524095
tokenizer_config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "199999": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "200018": {
15
+ "content": "<|endofprompt|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "200019": {
23
+ "content": "<|assistant|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": true,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "200020": {
31
+ "content": "<|end|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": true,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "200021": {
39
+ "content": "<|user|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": true,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "200022": {
47
+ "content": "<|system|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": true,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "200023": {
55
+ "content": "<|tool|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": true,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "200024": {
63
+ "content": "<|/tool|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": true,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "200025": {
71
+ "content": "<|tool_call|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": true,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "200026": {
79
+ "content": "<|/tool_call|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": true,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "200027": {
87
+ "content": "<|tool_response|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": true,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "200028": {
95
+ "content": "<|tag|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": true,
99
+ "single_word": false,
100
+ "special": true
101
+ }
102
+ },
103
+ "bos_token": "<|endoftext|>",
104
+ "clean_up_tokenization_spaces": false,
105
+ "eos_token": "<|endoftext|>",
106
+ "extra_special_tokens": {},
107
+ "model_max_length": 131072,
108
+ "pad_token": "<|endoftext|>",
109
+ "tokenizer_class": "GPT2Tokenizer",
110
+ "unk_token": "<|endoftext|>"
111
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c63c7f9cbcde7e7d09bc87889471974cc0f6f0a055096966e9ecffca626e8b2
3
+ size 7761
vocab.json ADDED
The diff for this file is too large to render. See raw diff