andrebarrosilva1123 committed on
Commit 4240128 · verified · 1 Parent(s): a4582fd

epoch 22427 | block 8073904 | steps 175 | loss 1.0663 | sim_abs 0.568 sq_acc 0.10

Files changed (3)
  1. config.json +11 -9
  2. generation_config.json +2 -3
  3. model.safetensors +2 -2
config.json CHANGED
@@ -1,15 +1,17 @@
 {
   "architectures": [
-    "Qwen2ForCausalLM"
+    "Qwen3ForCausalLM"
   ],
+  "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "dtype": "bfloat16",
   "eos_token_id": 151645,
+  "head_dim": 128,
   "hidden_act": "silu",
-  "hidden_size": 1536,
+  "hidden_size": 2048,
   "initializer_range": 0.02,
-  "intermediate_size": 8960,
+  "intermediate_size": 6144,
   "layer_types": [
     "full_attention",
     "full_attention",
@@ -40,16 +42,16 @@
     "full_attention",
     "full_attention"
   ],
-  "max_position_embeddings": 32768,
-  "max_window_layers": 21,
-  "model_type": "qwen2",
-  "num_attention_heads": 12,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
   "num_hidden_layers": 28,
-  "num_key_value_heads": 2,
+  "num_key_value_heads": 8,
   "pad_token_id": null,
   "rms_norm_eps": 1e-06,
   "rope_parameters": {
-    "rope_theta": 1000000.0,
+    "rope_theta": 1000000,
     "rope_type": "default"
   },
   "sliding_window": null,
generation_config.json CHANGED
@@ -6,9 +6,8 @@
     151643
   ],
   "pad_token_id": 151643,
-  "repetition_penalty": 1.1,
-  "temperature": 0.7,
+  "temperature": 0.6,
   "top_k": 20,
-  "top_p": 0.8,
+  "top_p": 0.95,
   "transformers_version": "5.6.2"
 }
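
The sampling defaults change to temperature 0.6, top_p 0.95, top_k 20, and the repetition penalty is dropped. A minimal sketch of how these defaults are picked up by generate(), again assuming a local checkout and a toy prompt:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
model = AutoModelForCausalLM.from_pretrained(".")

inputs = tok("Hello", return_tensors="pt")
# generation_config.json supplies temperature=0.6, top_p=0.95, top_k=20;
# do_sample=True is required for those sampling parameters to take effect.
out = model.generate(**inputs, do_sample=True, max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))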
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:47d41fed457727a87ad02720acad950f3039d142f5cf0f7fb90f2ec21f8df658
-size 3087467144
+oid sha256:0f27d007440d494ae1fe78920ce736de8f0e1738d554e66d61a92cf2e835df5c
+size 3441185608
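
The safetensors file is a Git LFS pointer, so only the hash and byte size change in the diff. The new size is consistent with the config change: at 2 bytes per bfloat16 parameter, the file sizes work out to roughly 1.54B parameters before and 1.72B after, matching the wider hidden size and extra heads (the 1.5B → 1.7B reading is an inference from the config, not stated in the commit). A quick back-of-the-envelope check, pure arithmetic with no repo access assumed:

# bfloat16 stores each parameter in 2 bytes, so bytes / 2 ≈ parameter count.
old_params = 3_087_467_144 / 2   # ≈ 1.54e9 (old Qwen2-style geometry)
new_params = 3_441_185_608 / 2   # ≈ 1.72e9 (new Qwen3-style geometry)
print(f"{old_params:.3g} -> {new_params:.3g} parameters")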