mazesmazes committed on
Commit
3542a34
·
verified ·
1 Parent(s): 6daaed2

Training in progress, step 1000

Browse files
Files changed (4) hide show
  1. config.json +69 -20
  2. generation_config.json +8 -6
  3. model.safetensors +1 -1
  4. training_args.bin +1 -1
config.json CHANGED
@@ -119,9 +119,10 @@
119
  "type": "audio"
120
  }
121
  },
122
- "do_sample": false,
123
  "downsample_rate": 5,
124
  "dtype": "bfloat16",
 
125
  "encoder": {
126
  "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
127
  "architectures": [
@@ -274,25 +275,24 @@
274
  "qformer_num_heads": 16,
275
  "qformer_num_layers": 2,
276
  "qformer_window_size": 15,
277
- "repetition_penalty": 1.0,
278
  "router_aux_loss_coef": 0.01,
279
  "system_prompt": "",
280
- "temperature": null,
281
  "text_config": {
282
- "_name_or_path": "Qwen/Qwen3-1.7B",
283
  "architectures": [
284
- "Qwen3ForCausalLM"
285
  ],
286
  "attention_bias": false,
287
  "attention_dropout": 0.0,
288
  "bos_token_id": null,
289
  "dtype": "bfloat16",
290
- "eos_token_id": 151645,
291
- "head_dim": 128,
292
  "hidden_act": "silu",
293
  "hidden_size": 2048,
294
  "initializer_range": 0.02,
295
- "intermediate_size": 6144,
296
  "layer_types": [
297
  "full_attention",
298
  "full_attention",
@@ -321,33 +321,82 @@
321
  "full_attention",
322
  "full_attention",
323
  "full_attention",
 
 
 
 
 
 
 
 
324
  "full_attention"
325
  ],
326
- "max_position_embeddings": 40960,
327
  "max_window_layers": 28,
328
- "model_type": "qwen3",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  "num_attention_heads": 16,
330
- "num_hidden_layers": 28,
331
- "num_key_value_heads": 8,
332
- "pad_token_id": 151643,
 
333
  "rms_norm_eps": 1e-06,
334
  "rope_parameters": {
335
- "rope_theta": 1000000,
336
  "rope_type": "default"
337
  },
338
  "sliding_window": null,
339
  "tie_word_embeddings": true,
340
- "use_cache": true,
341
  "use_sliding_window": false,
342
- "vocab_size": 151670
343
  },
344
- "text_model_id": "Qwen/Qwen3-1.7B",
345
  "time_mask_length": 100,
346
- "top_k": null,
347
- "top_p": null,
348
  "transformers_version": "5.0.0",
349
  "use_cache": false,
350
  "use_lora": false,
351
  "use_specaugment": true,
352
- "vocab_size": 151670
353
  }
 
119
  "type": "audio"
120
  }
121
  },
122
+ "do_sample": true,
123
  "downsample_rate": 5,
124
  "dtype": "bfloat16",
125
+ "enable_thinking": true,
126
  "encoder": {
127
  "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
128
  "architectures": [
 
275
  "qformer_num_heads": 16,
276
  "qformer_num_layers": 2,
277
  "qformer_window_size": 15,
278
+ "repetition_penalty": 1.1,
279
  "router_aux_loss_coef": 0.01,
280
  "system_prompt": "",
281
+ "temperature": 1.0,
282
  "text_config": {
283
+ "_name_or_path": "HuggingFaceTB/SmolLM3-3B",
284
  "architectures": [
285
+ "SmolLM3ForCausalLM"
286
  ],
287
  "attention_bias": false,
288
  "attention_dropout": 0.0,
289
  "bos_token_id": null,
290
  "dtype": "bfloat16",
291
+ "eos_token_id": 128012,
 
292
  "hidden_act": "silu",
293
  "hidden_size": 2048,
294
  "initializer_range": 0.02,
295
+ "intermediate_size": 11008,
296
  "layer_types": [
297
  "full_attention",
298
  "full_attention",
 
321
  "full_attention",
322
  "full_attention",
323
  "full_attention",
324
+ "full_attention",
325
+ "full_attention",
326
+ "full_attention",
327
+ "full_attention",
328
+ "full_attention",
329
+ "full_attention",
330
+ "full_attention",
331
+ "full_attention",
332
  "full_attention"
333
  ],
334
+ "max_position_embeddings": 65536,
335
  "max_window_layers": 28,
336
+ "mlp_bias": false,
337
+ "model_type": "smollm3",
338
+ "no_rope_layer_interval": 4,
339
+ "no_rope_layers": [
340
+ 1,
341
+ 1,
342
+ 1,
343
+ 0,
344
+ 1,
345
+ 1,
346
+ 1,
347
+ 0,
348
+ 1,
349
+ 1,
350
+ 1,
351
+ 0,
352
+ 1,
353
+ 1,
354
+ 1,
355
+ 0,
356
+ 1,
357
+ 1,
358
+ 1,
359
+ 0,
360
+ 1,
361
+ 1,
362
+ 1,
363
+ 0,
364
+ 1,
365
+ 1,
366
+ 1,
367
+ 0,
368
+ 1,
369
+ 1,
370
+ 1,
371
+ 0,
372
+ 1,
373
+ 1,
374
+ 1,
375
+ 0
376
+ ],
377
  "num_attention_heads": 16,
378
+ "num_hidden_layers": 36,
379
+ "num_key_value_heads": 4,
380
+ "pad_token_id": 128004,
381
+ "pretraining_tp": 2,
382
  "rms_norm_eps": 1e-06,
383
  "rope_parameters": {
384
+ "rope_theta": 5000000.0,
385
  "rope_type": "default"
386
  },
387
  "sliding_window": null,
388
  "tie_word_embeddings": true,
389
+ "use_cache": false,
390
  "use_sliding_window": false,
391
+ "vocab_size": 128257
392
  },
393
+ "text_model_id": "HuggingFaceTB/SmolLM3-3B",
394
  "time_mask_length": 100,
395
+ "top_k": 0,
396
+ "top_p": 0.01,
397
  "transformers_version": "5.0.0",
398
  "use_cache": false,
399
  "use_lora": false,
400
  "use_specaugment": true,
401
+ "vocab_size": 128257
402
  }
generation_config.json CHANGED
@@ -1,17 +1,19 @@
1
  {
2
- "bos_token_id": 151643,
3
- "do_sample": false,
4
  "eos_token_id": [
5
- 151645,
6
- 151643
7
  ],
8
  "length_penalty": 1.0,
9
  "max_new_tokens": 128,
10
  "min_new_tokens": 0,
11
  "no_repeat_ngram_size": 0,
12
  "num_beams": 1,
13
- "pad_token_id": 151643,
14
- "repetition_penalty": 1.0,
 
 
 
15
  "transformers_version": "5.0.0",
16
  "use_cache": true
17
  }
 
1
  {
2
+ "bos_token_id": 128000,
3
+ "do_sample": true,
4
  "eos_token_id": [
5
+ 128012
 
6
  ],
7
  "length_penalty": 1.0,
8
  "max_new_tokens": 128,
9
  "min_new_tokens": 0,
10
  "no_repeat_ngram_size": 0,
11
  "num_beams": 1,
12
+ "pad_token_id": 128004,
13
+ "repetition_penalty": 1.1,
14
+ "temperature": 1.0,
15
+ "top_k": 0,
16
+ "top_p": 0.01,
17
  "transformers_version": "5.0.0",
18
  "use_cache": true
19
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18e5986e79a0bf7b6c609ef997ab8da9d47208a6753713e66aa703df851fb488
3
  size 14682440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31bee93148f1023fd9a3e8383f2d796f69652a50e205d7b0b3e5582346b4ea28
3
  size 14682440
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:158cc8e5370e21a00c365676358f28052402152962cd24952873fbc3921655ed
3
  size 5265
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f727a975ff62955fdbc5015fb655f4e0d82736cd6c637b7cf87e5e59e811fe1
3
  size 5265