liuxz0801 committed on
Commit
390dc41
·
verified ·
1 Parent(s): 9275929

Update configuration_telechat3.py

Browse files
Files changed (1) hide show
  1. configuration_telechat3.py +106 -106
configuration_telechat3.py CHANGED
@@ -1,106 +1,106 @@
1
- # coding=utf-8
2
- # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- """ Telechat configuration"""
17
-
18
- from transformers.configuration_utils import PretrainedConfig
19
-
20
-
21
- class Telechat3Config(PretrainedConfig):
22
- model_type = "telechat3"
23
- keys_to_ignore_at_inference = ["past_key_values"]
24
- base_model_tp_plan = {
25
- "layers.*.self_attn.q_proj": "colwise",
26
- "layers.*.self_attn.k_proj": "colwise",
27
- "layers.*.self_attn.v_proj": "colwise",
28
- "layers.*.self_attn.o_proj": "rowwise",
29
- "layers.*.mlp.gate_proj": "colwise",
30
- "layers.*.mlp.up_proj": "colwise",
31
- "layers.*.mlp.down_proj": "rowwise",
32
- }
33
- base_model_pp_plan = {
34
- "embed_tokens": (["input_ids"], ["inputs_embeds"]),
35
- "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
36
- "norm": (["hidden_states"], ["hidden_states"]),
37
- }
38
-
39
- def __init__(
40
- self,
41
- attention_bias=False,
42
- attention_dropout=0.0,
43
- bos_token_id=1,
44
- eos_token_id=2,
45
- head_dim=128,
46
- hidden_act="silu",
47
- hidden_size=6144,
48
- initializer_range=0.0048,
49
- intermediate_size=24576,
50
- max_position_embeddings=2048,
51
- mlp_bias=False,
52
- model_type="telechat3",
53
- num_attention_heads=48,
54
- num_hidden_layers=64,
55
- num_key_value_heads=None,
56
- original_max_position_embeddings=8192,
57
- pad_token_id=None,
58
- pretraining_tp=1,
59
- rms_norm_eps=1e-5,
60
- rope_scaling=None,
61
- rope_theta=1000000.0,
62
- tie_word_embeddings=False,
63
- use_cache=True,
64
- vocab_size=131072,
65
- **kwargs,
66
- ):
67
- self.attention_bias = attention_bias
68
- self.attention_dropout = attention_dropout
69
- self.hidden_size = hidden_size
70
- self.hidden_act = hidden_act
71
- self.intermediate_size = intermediate_size
72
- self.mlp_bias = mlp_bias
73
- self.max_position_embeddings = max_position_embeddings
74
- self.num_hidden_layers = num_hidden_layers
75
- self.num_attention_heads = num_attention_heads
76
-
77
- # for backward compatibility
78
- if num_key_value_heads is None:
79
- num_key_value_heads = num_attention_heads
80
- self.num_key_value_heads = num_key_value_heads
81
-
82
- self.initializer_range = initializer_range
83
-
84
- self.pretraining_tp = pretraining_tp
85
- self.rms_norm_eps = rms_norm_eps
86
- self.rope_theta = rope_theta
87
- self.rope_scaling = rope_scaling
88
- self.use_cache = use_cache
89
- self.vocab_size = vocab_size
90
-
91
- if head_dim is not None and head_dim != self.hidden_size // self.num_attention_heads:
92
- raise ValueError("head_dim != hidden_size//num_attention_head.Please check the config.")
93
- self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
94
-
95
- # Validate the correctness of rotary position embeddings parameters
96
- # BC: if there is a 'type' field, copy it it to 'rope_type'.
97
- if self.rope_scaling is not None and "type" in self.rope_scaling:
98
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
99
-
100
- super().__init__(
101
- pad_token_id=pad_token_id,
102
- bos_token_id=bos_token_id,
103
- eos_token_id=eos_token_id,
104
- tie_word_embeddings=tie_word_embeddings,
105
- **kwargs,
106
- )
 
1
+ # coding=utf-8
2
+ # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Telechat configuration"""
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+
20
+
21
+ class TeleChat3Config(PretrainedConfig):
22
+ model_type = "telechat3"
23
+ keys_to_ignore_at_inference = ["past_key_values"]
24
+ base_model_tp_plan = {
25
+ "layers.*.self_attn.q_proj": "colwise",
26
+ "layers.*.self_attn.k_proj": "colwise",
27
+ "layers.*.self_attn.v_proj": "colwise",
28
+ "layers.*.self_attn.o_proj": "rowwise",
29
+ "layers.*.mlp.gate_proj": "colwise",
30
+ "layers.*.mlp.up_proj": "colwise",
31
+ "layers.*.mlp.down_proj": "rowwise",
32
+ }
33
+ base_model_pp_plan = {
34
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
35
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
36
+ "norm": (["hidden_states"], ["hidden_states"]),
37
+ }
38
+
39
+ def __init__(
40
+ self,
41
+ attention_bias=False,
42
+ attention_dropout=0.0,
43
+ bos_token_id=1,
44
+ eos_token_id=2,
45
+ head_dim=128,
46
+ hidden_act="silu",
47
+ hidden_size=6144,
48
+ initializer_range=0.0048,
49
+ intermediate_size=24576,
50
+ max_position_embeddings=2048,
51
+ mlp_bias=False,
52
+ model_type="telechat3",
53
+ num_attention_heads=48,
54
+ num_hidden_layers=64,
55
+ num_key_value_heads=None,
56
+ original_max_position_embeddings=8192,
57
+ pad_token_id=None,
58
+ pretraining_tp=1,
59
+ rms_norm_eps=1e-5,
60
+ rope_scaling=None,
61
+ rope_theta=1000000.0,
62
+ tie_word_embeddings=False,
63
+ use_cache=True,
64
+ vocab_size=131072,
65
+ **kwargs,
66
+ ):
67
+ self.attention_bias = attention_bias
68
+ self.attention_dropout = attention_dropout
69
+ self.hidden_size = hidden_size
70
+ self.hidden_act = hidden_act
71
+ self.intermediate_size = intermediate_size
72
+ self.mlp_bias = mlp_bias
73
+ self.max_position_embeddings = max_position_embeddings
74
+ self.num_hidden_layers = num_hidden_layers
75
+ self.num_attention_heads = num_attention_heads
76
+
77
+ # for backward compatibility
78
+ if num_key_value_heads is None:
79
+ num_key_value_heads = num_attention_heads
80
+ self.num_key_value_heads = num_key_value_heads
81
+
82
+ self.initializer_range = initializer_range
83
+
84
+ self.pretraining_tp = pretraining_tp
85
+ self.rms_norm_eps = rms_norm_eps
86
+ self.rope_theta = rope_theta
87
+ self.rope_scaling = rope_scaling
88
+ self.use_cache = use_cache
89
+ self.vocab_size = vocab_size
90
+
91
+ if head_dim is not None and head_dim != self.hidden_size // self.num_attention_heads:
92
+ raise ValueError("head_dim != hidden_size//num_attention_head.Please check the config.")
93
+ self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
94
+
95
+ # Validate the correctness of rotary position embeddings parameters
96
+ # BC: if there is a 'type' field, copy it it to 'rope_type'.
97
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
98
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
99
+
100
+ super().__init__(
101
+ pad_token_id=pad_token_id,
102
+ bos_token_id=bos_token_id,
103
+ eos_token_id=eos_token_id,
104
+ tie_word_embeddings=tie_word_embeddings,
105
+ **kwargs,
106
+ )