Tele-AI
/

TeleChat3-Coder-36B-Thinking

Safetensors

telechat3

custom_code

Model card Files Files and versions

xet

Community

liuxz0801 committed 27 days ago

Commit

1138da1

verified ·

1 Parent(s): f814882

Update modeling_telechat3.py

Browse files

Files changed (1) hide show

modeling_telechat3.py +33 -33

modeling_telechat3.py CHANGED Viewed

@@ -44,7 +44,7 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, logging
-from .configuration_telechat3 import Telechat3Config
 logger = logging.get_logger(__name__)
@@ -152,10 +152,10 @@ ROPE_INIT_FUNCTIONS['telechat3-yarn'] = _compute_telechat_yarn_parameters
 @use_kernel_forward_from_hub("RMSNorm")
-class Telechat3RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        Telechat3RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -172,8 +172,8 @@ class Telechat3RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-class Telechat3RotaryEmbedding(nn.Module):
-    def __init__(self, config: Telechat3Config, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
@@ -240,7 +240,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
-class Telechat3MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -294,10 +294,10 @@ def eager_attention_forward(
     return attn_output, attn_weights
-class Telechat3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
-    def __init__(self, config: Telechat3Config, layer_idx: int):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -364,16 +364,16 @@ class Telechat3Attention(nn.Module):
         return attn_output, attn_weights
-class Telechat3DecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: Telechat3Config, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = Telechat3Attention(config=config, layer_idx=layer_idx)
-        self.mlp = Telechat3MLP(config)
-        self.input_layernorm = Telechat3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = Telechat3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
             self,
@@ -418,11 +418,11 @@ class Telechat3DecoderLayer(GradientCheckpointingLayer):
 @auto_docstring
-class Telechat3PreTrainedModel(PreTrainedModel):
-    config_class = Telechat3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Telechat3DecoderLayer"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn_3 = True
     _supports_flash_attn_2 = True
@@ -443,23 +443,23 @@ class Telechat3PreTrainedModel(PreTrainedModel):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, Telechat3RMSNorm):
             module.weight.data.fill_(1.0)
 @auto_docstring
-class Telechat3Model(Telechat3PreTrainedModel):
-    def __init__(self, config: Telechat3Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [Telechat3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
-        self.norm = Telechat3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.rotary_emb = Telechat3RotaryEmbedding(config=config)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -577,14 +577,14 @@ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
 @auto_docstring
-class Telechat3ForCausalLM(Telechat3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):
         super().__init__(config)
-        self.model = Telechat3Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -666,9 +666,9 @@ class Telechat3ForCausalLM(Telechat3PreTrainedModel, GenerationMixin):
 @auto_docstring(
     custom_intro="""
-    The Telechat3 Model transformer with a sequence classification head on top (linear layer).
-    [`Telechat3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -678,11 +678,11 @@ class Telechat3ForCausalLM(Telechat3PreTrainedModel, GenerationMixin):
     each row of the batch).
     """
 )
-class Telechat3ForSequenceClassification(Telechat3PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = Telechat3Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
         # Initialize weights and apply final processing
@@ -765,13 +765,13 @@ class Telechat3ForSequenceClassification(Telechat3PreTrainedModel):
 @auto_docstring
-class Telechat3ForQuestionAnswering(Telechat3PreTrainedModel):
     base_model_prefix = "transformer"
     # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Telechat3
     def __init__(self, config):
         super().__init__(config)
-        self.transformer = Telechat3Model(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
         # Initialize weights and apply final processing
@@ -829,11 +829,11 @@ class Telechat3ForQuestionAnswering(Telechat3PreTrainedModel):
 @auto_docstring
-class Telechat3ForTokenClassification(Telechat3PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = Telechat3Model(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None:

 from transformers.processing_utils import Unpack
 from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, logging
+from .configuration_telechat3 import TeleChat3Config
 logger = logging.get_logger(__name__)
 @use_kernel_forward_from_hub("RMSNorm")
+class TeleChat3RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
+        TeleChat3RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class TeleChat3RotaryEmbedding(nn.Module):
+    def __init__(self, config: TeleChat3Config, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
     return q_embed, k_embed
+class TeleChat3MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
     return attn_output, attn_weights
+class TeleChat3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: TeleChat3Config, layer_idx: int):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
         return attn_output, attn_weights
+class TeleChat3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: TeleChat3Config, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.self_attn = TeleChat3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = TeleChat3MLP(config)
+        self.input_layernorm = TeleChat3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = TeleChat3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
             self,
 @auto_docstring
+class TeleChat3PreTrainedModel(PreTrainedModel):
+    config_class = TeleChat3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["TeleChat3DecoderLayer"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn_3 = True
     _supports_flash_attn_2 = True
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, TeleChat3RMSNorm):
             module.weight.data.fill_(1.0)
 @auto_docstring
+class TeleChat3Model(TeleChat3PreTrainedModel):
+    def __init__(self, config: TeleChat3Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
+            [TeleChat3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
+        self.norm = TeleChat3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = TeleChat3RotaryEmbedding(config=config)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
 @auto_docstring
+class TeleChat3ForCausalLM(TeleChat3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):
         super().__init__(config)
+        self.model = TeleChat3Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 @auto_docstring(
     custom_intro="""
+    The TeleChat3 Model transformer with a sequence classification head on top (linear layer).
+    [`TeleChat3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
     Since it does classification on the last token, it requires to know the position of the last token. If a
     each row of the batch).
     """
 )
+class TeleChat3ForSequenceClassification(TeleChat3PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.model = TeleChat3Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
         # Initialize weights and apply final processing
 @auto_docstring
+class TeleChat3ForQuestionAnswering(TeleChat3PreTrainedModel):
     base_model_prefix = "transformer"
     # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->TeleChat3
     def __init__(self, config):
         super().__init__(config)
+        self.transformer = TeleChat3Model(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
         # Initialize weights and apply final processing
 @auto_docstring
+class TeleChat3ForTokenClassification(TeleChat3PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.model = TeleChat3Model(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None: