| """Ernie model configuration""" |
| import copy |
|
|
| from typing import List, Optional, Tuple, Union |
|
|
| from transformers import PretrainedConfig |
|
|
|
|

__all__ = [
    "ERNIE_PRETRAINED_INIT_CONFIGURATION",
    "DFNRopeVisionTransformerConfig",
    "Ernie4_5_Config",
    "Ernie4_5_MoEConfig",
    "Ernie4_5_VLMoEConfig",
]


class DFNRopeVisionTransformerConfig(PretrainedConfig):
    """
    Configuration class for the DFNRopeVisionTransformer model.

    This class inherits from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    model_type = "DFNRope_vision_transformer"
    base_model_tp_plan = {}

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        attn_implementation="eager",
        pp_data_balance=False,
        recompute=False,
        attn_sep=False,
        vit_first_fwd_bsz=128,
        vit_num_recompute_layers=10000,
        **kwargs,
    ):
| """ |
| Initialize DFNRopeVisionTransformer model configuration with default or specified parameters. |
| |
| Args: |
| depth (int): Number of transformer layers in the model. |
| embed_dim (int): Dimensionality of the embedding layer. |
| hidden_size (int): Dimensionality of the feedforward network. |
| hidden_act (str): Activation function for the feedforward network. |
| mlp_ratio (float): Ratio between the number of input features and |
| the number of output features in the feedforward network. |
| num_heads (int): Number of attention heads in each attention layer. |
| in_channels (int): Number of channels in the input image. |
| patch_size (int): |
| Size of patches in the input image. Defaults to 14. |
| spatial_merge_size (int): |
| Spatial merge size for the spatial transformer module. Defaults to 2. |
| attn_implementation (str): Attention implementation type. Defaults to "eager". |
| pp_data_balance (bool): Whether to balance data during preprocessing. Defaults to False. |
| recompute (bool): Whether to use recompute. Defaults to False. |
| attn_sep (bool): Whether to separate attention computation into two stages. Defaults to False. |
| vit_first_fwd_bsz (int): First forward batch size for ViT. Defaults to 128. |
| vit_num_recompute_layers (int): Number of recomputed layers for ViT. Defaults to |
| """ |
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.attn_implementation = attn_implementation
        self.pp_data_balance = pp_data_balance
        self.recompute = recompute
        self.attn_sep = attn_sep
        self.vit_first_fwd_bsz = vit_first_fwd_bsz
        self.vit_num_recompute_layers = vit_num_recompute_layers

    def get(self, key, default=None):
        """Return the config value for `key`, or `default` if it is not set."""
        return getattr(self, key, default)
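

# Illustrative usage (added commentary, not from the original file): the config
# behaves like any other `PretrainedConfig`, so defaults can be overridden per
# keyword argument, and `get` offers dict-style access with a fallback.
#
#     >>> vision_cfg = DFNRopeVisionTransformerConfig(depth=24, num_heads=8)
#     >>> vision_cfg.patch_size             # unset fields keep their defaults
#     14
#     >>> vision_cfg.get("nonexistent", 0)  # missing keys return the fallback
#     0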


ERNIE_PRETRAINED_INIT_CONFIGURATION = {
    "ernie/tiny-random-ernie": {
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 11008,
        "max_position_embeddings": 2048,
        "model_type": "ernie",
        "num_attention_heads": 2,
        "num_hidden_layers": 2,
        "rms_norm_eps": 1e-06,
        "vocab_size": 32000,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "pad_token_id": 0,
        "use_cache": False,
        "recompute": False,
        "use_flash_attn": True,
        "use_pure_fp16": False,
    },
}
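

# Illustrative usage (added commentary, not from the original file): a preset
# from the dictionary above can seed a config. `model_type` is filtered out
# here because it is a class attribute rather than an `__init__` argument.
#
#     >>> preset = ERNIE_PRETRAINED_INIT_CONFIGURATION["ernie/tiny-random-ernie"]
#     >>> cfg = Ernie4_5_Config(**{k: v for k, v in preset.items() if k != "model_type"})
#     >>> cfg.hidden_size
#     768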


class Ernie4_5_Config(PretrainedConfig):
    """
    Configuration class for the ERNIE model.

    This class stores the configuration of an ERNIE model, defining the model architecture.
    It inherits from PretrainedConfig and can be used to control model outputs.
    """

    model_type = "ernie"
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=True,
        use_sparse_flash_attn=True,
        use_var_len_flash_attn=False,
        recompute=False,
        recompute_granularity="core_attn",
        recompute_use_reentrant=False,
        use_rmsnorm=True,
        fuse_rms_norm=False,
        fuse_ln=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        fuse_swiglu=False,
        use_bias=False,
        rope_theta=10000,
        fuse_rope=False,
        fuse_softmax_mask=False,
        use_fast_ln=False,
        weight_share_add_bias=True,
        fuse_linear=False,
        max_sequence_length=None,
        ignored_index=-100,
        add_tail_layers=False,
        use_recompute_lm_head=False,
        use_recompute_loss_fn=False,
        refined_recompute=dict(),
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        use_sparse_head_and_loss_fn=False,
        micro_batch_size=-1,
        use_ep_comm_overlap=False,
        use_fused_head_and_loss_fn=False,
        token_balance_loss=False,
        token_balance_seqlen=False,
        cachekv_quant: bool = False,
        pp_seg_method="layer:ErnieDecoderLayer|EmptyLayer",
        **kwargs,
    ):
| """ |
| Initialize ERNIE model configuration with default or specified parameters. |
| |
| Args: |
| vocab_size (int): Size of the vocabulary (number of unique tokens) |
| hidden_size (int): Dimensionality of the encoder layers and the pooler layer |
| intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer |
| max_position_embeddings (int): Maximum sequence length the model can handle |
| num_hidden_layers (int): Number of hidden layers in the Transformer encoder |
| num_attention_heads (int): Number of attention heads for each attention layer |
| rms_norm_eps (float): The epsilon used by the RMS normalization layers |
| use_cache (bool): Whether to use caching for faster generation (decoding) |
| use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation |
| use_sparse_flash_attn (bool): Whether to use sparse FlashAttention |
| use_var_len_flash_attn (bool): Whether to use variable-length FlashAttention |
| recompute (bool): Whether to use gradient checkpointing to save memory |
| recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.) |
| recompute_use_reentrant (bool): Whether to use reentrant checkpointing |
| use_rmsnorm (bool): Whether to use RMSNorm instead of LayerNorm |
| fuse_rms_norm (bool): Whether to fuse RMSNorm operations for optimization |
| fuse_ln (bool): Whether to fuse LayerNorm operations |
| pad_token_id (int): Token ID used for padding sequences |
| bos_token_id (int): Token ID used for beginning-of-sequence |
| eos_token_id (int): Token ID used for end-of-sequence |
| fuse_swiglu (bool): Whether to fuse SwiGLU operations |
| use_bias (bool): Whether to use bias terms in linear layers |
| rope_theta (float): The base period of the RoPE embeddings |
| fuse_rope (bool): Whether to fuse RoPE operations |
| use_fast_ln (bool): Whether to use optimized LayerNorm implementation |
| weight_share_add_bias (bool): Whether to share bias weights in certain layers |
| fuse_linear (bool): Whether to fuse linear operations |
| max_sequence_length (int): Maximum sequence length for positional embeddings |
| ignored_index (int): Target value that is ignored during loss computation |
| add_tail_layers (bool): Whether to add additional layers at the end |
| use_recompute_lm_head (bool): Whether to recompute gradients for language model head |
| use_recompute_loss_fn (bool): Whether to recompute gradients for loss function |
| refined_recompute (dict): Dictionary specifying refined recomputation settings |
| attention_probs_dropout_prob (float): Dropout probability for attention weights |
| hidden_dropout_prob (float): Dropout probability for hidden layers |
| compression_ratio (float): Ratio for KV cache compression (1.0 = no compression) |
| num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention) |
| use_sparse_head_and_loss_fn (bool): Whether to use sparse attention head and loss function |
| micro_batch_size (int): Size of micro batches (-1 for automatic) |
| use_ep_comm_overlap (bool): Whether to overlap communication with computation |
| use_fused_head_loss_fn (bool): Whether to use fused head and loss function |
| token_balance_loss (bool): Whether to balance loss by token count |
| token_balance_seqlen (bool): Whether to balance sequence lengths |
| cachekv_quant (bool): Whether to quantize key-value cache |
| pp_seg_method (str): Method for pipeline parallel segmentation |
| **kwargs: Additional keyword arguments passed to parent class |
| """ |

        if "tie_word_embeddings" not in kwargs:
            kwargs["tie_word_embeddings"] = False
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.recompute = recompute
        self.recompute_granularity = recompute_granularity
        self.use_flash_attention = use_flash_attention
        self.use_sparse_flash_attn = use_sparse_flash_attn
        self.recompute_use_reentrant = recompute_use_reentrant
        self.use_var_len_flash_attn = use_var_len_flash_attn
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.fuse_swiglu = fuse_swiglu
        self.fuse_rms_norm = fuse_rms_norm
        self.fuse_ln = fuse_ln
        self.use_rmsnorm = use_rmsnorm
        self.micro_batch_size = micro_batch_size
        self.max_sequence_length = max_sequence_length
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.fuse_rope = fuse_rope
        self.fuse_softmax_mask = fuse_softmax_mask
        self.use_fast_ln = use_fast_ln
        self.fuse_linear = fuse_linear
        self.ignored_index = ignored_index
        self.add_tail_layers = add_tail_layers
        self.use_recompute_lm_head = use_recompute_lm_head
        self.use_recompute_loss_fn = use_recompute_loss_fn
        self.refined_recompute = refined_recompute
        self.skip_recompute_ops = dict()
| """ |
| `refined_recompute` is a dictionary that specifies fine-grained gradient recomputation settings, |
| which currently only takes effect in Pipeline Parallel (PP) mode. |
| |
| In PP mode, this dictionary populates `self.skip_recompute_ops` with the following structure: |
| - Key (`op_name`): The operation name to configure, with possible values: |
| * "mlp_row_ln" - MLP row-wise layer normalization |
| * "flash_attn" - Flash attention operation |
| * "attention_row_ln" - Attention row-wise layer normalization |
| * "attention_column_ln" - Attention column-wise layer normalization |
| * "mlp_column_ln" - MLP column-wise layer normalization |
| |
| - Value (`skip_num`): Controls how many times to skip recomputation: |
| * 0: Never skip recomputation (minimum memory usage) |
| * -1: Always skip recomputation (maximum memory usage) |
| * [0,1,...,12]: Skip recomputation for specified number of times |
| * ≥12: Equivalent to -1 (always skip recomputation) |
| |
| This allows precise control over memory/computation tradeoffs for different operations. |
| """ |
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.use_sparse_head_and_loss_fn = use_sparse_head_and_loss_fn
        self.use_ep_comm_overlap = use_ep_comm_overlap
        self.use_fused_head_and_loss_fn = use_fused_head_and_loss_fn
        self.token_balance_loss = token_balance_loss
        self.token_balance_seqlen = token_balance_seqlen
        self.cachekv_quant = cachekv_quant
        self.pp_seg_method = pp_seg_method

    def get(self, key, default=None):
        """Return the config value for `key`, or `default` if it is not set."""
        return getattr(self, key, default)
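

# Illustrative usage (added commentary, not from the original file): a small
# dense config with grouped-query attention. `num_key_value_heads` defaults to
# None (plain multi-head attention) when unset, and `tie_word_embeddings` is
# forced to False by `__init__` unless explicitly provided.
#
#     >>> cfg = Ernie4_5_Config(
#     ...     hidden_size=1024,
#     ...     num_hidden_layers=4,
#     ...     num_attention_heads=16,
#     ...     num_key_value_heads=4,  # GQA: 4 KV heads shared by 16 query heads
#     ... )
#     >>> cfg.tie_word_embeddings
#     False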


class Ernie4_5_MoEConfig(Ernie4_5_Config):
    r"""
    Configuration class for the ErnieMoE model architecture.

    This class stores the configuration for a [`~ErnieModel`] and is used to instantiate
    an ErnieMoE model according to the specified arguments. Inherits from [`PretrainedConfig`]
    and can control model outputs.

    Attributes:
        Inherits all attributes from Ernie4_5_Config and adds MoE-specific configurations.
    """

    model_type = "ernie"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION
    base_model_tp_plan = {}

    def __init__(
        self,
        moe_num_experts: Union[int, list] = 0,
        use_recompute_moe=False,
        moe_capacity=(),
        moe_layer_interval=2,
        moe_layer_start_index=0,
        moe_layer_end_index=-1,
        moe_aux_loss_lambda=1e-2,
        moe_z_loss_lambda=1e-4,
        moe_orthogonal_loss_lambda=1e-2,
        sinkhorn_2gate=True,
        sinkhorn_temp=3e-2,
        global_aux_loss=False,
        moe_dropout_prob=0.0,
        moe_group="world",
        moe_gate="top2",
        moe_intermediate_size: Union[int, list] = 0,
        moe_num_shared_experts: int = 0,
        moe_reverse_token_drop: bool = False,
        moe_gate_act: str = "softmax",
        moe_norm_gate_logits=True,
        moe_all_to_all_dropout: float = 0.0,
        moe_k=2,
        moe_use_aux_free: bool = False,
        moe_group_experts: bool = False,
        moe_group_orthogonal_loss: bool = True,
        enable_delay_scale_loss: bool = True,
        num_acc_steps: int = 1,
        fuse_gate_detach_matmul: bool = False,
        dpo_config=None,
        moe_multimodal_dispatch_use_allgather: str = "",
        moe_use_hard_gate=False,
        moe_dense_experts_token_type_id=3,
        **kwargs,
    ):
| """ |
| Initialize ErnieMoE configuration with MoE-specific parameters. |
| |
| Args: |
| moe_num_experts: Number of experts in MoE layers |
| use_recompute_moe: Whether to use recomputation for MoE layers |
| moe_capacity: Capacity configuration for MoE layers |
| moe_layer_interval: Interval between MoE layers |
| moe_layer_start_index: Starting layer index for MoE |
| moe_layer_end_index: Ending layer index for MoE (-1 means last layer) |
| moe_aux_loss_lambda: Weight for auxiliary loss |
| moe_z_loss_lambda: Weight for z-loss |
| moe_orthogonal_loss_lambda: Weight for orthogonal loss |
| sinkhorn_2gate: Whether to use sinkhorn 2-gate routing |
| sinkhorn_temp: Temperature for sinkhorn routing |
| global_aux_loss: Whether to use global auxiliary loss |
| moe_dropout_prob: Dropout probability for MoE layers |
| moe_group: Group configuration for MoE experts |
| moe_gate: Type of gating mechanism ('top2', etc.) |
| moe_intermediate_size: Intermediate size for MoE layers |
| moe_num_shared_experts: Number of shared experts |
| moe_reverse_token_drop: Whether to use reverse token dropping |
| moe_gate_act: Activation function for gating |
| moe_norm_gate_logits: Whether to normalize gate logits |
| moe_all_to_all_dropout: Dropout for all-to-all communication |
| moe_k: Number of experts to route to |
| moe_use_aux_free: Whether to use auxiliary-free routing |
| moe_group_experts: Whether to group experts (requires hard gating) |
| moe_group_orthogonal_loss: Whether to use group orthogonal loss |
| enable_delay_scale_loss: Whether to enable delayed loss scaling |
| num_acc_steps: Number of accumulation steps |
| fuse_gate_detach_matmul: Whether to fuse gate detach matmul |
| **kwargs: Additional base model configuration parameters |
| |
| Note: |
| When use_recompute_moe is True, recompute_granularity will be changed to full_attn. |
| """ |
        if use_recompute_moe:
            if kwargs.get("recompute") and kwargs.get("recompute_granularity") == "full":
                logger.warning(
                    "`use_recompute_moe` is True; changing `recompute_granularity` "
                    "from `full` to `full_attn`."
                )
                kwargs["recompute_granularity"] = "full_attn"
        super().__init__(**kwargs)

        self.moe_num_experts = moe_num_experts
        self.use_recompute_moe = use_recompute_moe
        self.moe_capacity = moe_capacity
        self.moe_aux_loss_lambda = moe_aux_loss_lambda
        self.moe_z_loss_lambda = moe_z_loss_lambda
        self.moe_orthogonal_loss_lambda = moe_orthogonal_loss_lambda
        self.global_aux_loss = global_aux_loss
        self.sinkhorn_2gate = sinkhorn_2gate
        self.sinkhorn_temp = sinkhorn_temp
        self.moe_layer_interval = moe_layer_interval
        self.moe_dropout_prob = moe_dropout_prob
        self.moe_group = moe_group
        self.moe_gate = moe_gate
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_shared_experts = moe_num_shared_experts
        self.moe_reverse_token_drop = moe_reverse_token_drop
        self.moe_k = moe_k
        self.moe_all_to_all_dropout = moe_all_to_all_dropout
        self.moe_group_experts = moe_group_experts
        self.moe_group_orthogonal_loss = moe_group_orthogonal_loss
        self.enable_delay_scale_loss = enable_delay_scale_loss
        self.num_acc_steps = num_acc_steps
        self.moe_layer_start_index = moe_layer_start_index
        self.moe_layer_end_index = (
            self.num_hidden_layers - 1
            if moe_layer_end_index == -1
            else moe_layer_end_index
        )
        self.moe_gate_act = moe_gate_act
        self.moe_norm_gate_logits = moe_norm_gate_logits
        self.moe_use_aux_free = moe_use_aux_free
        self.fuse_gate_detach_matmul = fuse_gate_detach_matmul
        self.dpo_config = dpo_config
        self.moe_multimodal_dispatch_use_allgather = (
            moe_multimodal_dispatch_use_allgather
        )
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id

    @property
    def multimodel_experts(self) -> bool:
        """Whether multiple expert groups (one per modality) are configured."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check if the model uses an MoE architecture.

        Returns:
            bool: True if any experts are configured, False otherwise.
        """
        return (
            sum(self.moe_num_experts) > 0
            if self.multimodel_experts
            else self.moe_num_experts > 0
        )
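

# Illustrative usage (added commentary, not from the original file):
# `moe_num_experts` may be an int (one expert pool) or a list (one pool per
# modality); the properties above react accordingly.
#
#     >>> moe_cfg = Ernie4_5_MoEConfig(moe_num_experts=64, moe_k=2)
#     >>> moe_cfg.use_moe, moe_cfg.multimodel_experts
#     (True, False)
#     >>> mm_cfg = Ernie4_5_MoEConfig(moe_num_experts=[64, 64])
#     >>> mm_cfg.multimodel_experts
#     True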


class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
    """
    This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a small test configuration (cf. `ernie/tiny-random-ernie`) rather than a full-size model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~ErnieModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 2):
            Number of attention heads for each attention layer in the Transformer encoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
    """

    model_type = "ernie4_5_moe_vl"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
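
    # Note (added commentary, not from the original file): the tensor-parallel
    # plan below appears to follow the `transformers` sharding convention,
    # where "colwise" shards a linear layer along its output dimension,
    # "rowwise" along its input dimension, and the "_rep" variants keep the
    # output replicated on every rank; "*" wildcards match layer/expert indices.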
    base_model_tp_plan = {
        "model.layers.*.self_attn.q_proj": "colwise_rep",
        "model.layers.*.self_attn.k_proj": "colwise_rep",
        "model.layers.*.self_attn.v_proj": "colwise_rep",
        "model.layers.*.self_attn.o_proj": "rowwise_rep",
        "model.layers.*.mlp.experts.*.gate_proj": "colwise",
        "model.layers.*.mlp.experts.*.up_proj": "colwise",
        "model.layers.*.mlp.experts.*.down_proj": "rowwise",
        "model.layers.*.mlp_text.experts.*.gate_proj": "colwise",
        "model.layers.*.mlp_text.experts.*.up_proj": "colwise",
        "model.layers.*.mlp_text.experts.*.down_proj": "rowwise",
        "model.layers.*.mlp.gate_proj": "colwise",
        "model.layers.*.mlp.up_proj": "colwise",
        "model.layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vision_config=None,
        im_patch_id=None,
        pixel_hidden_size=None,
        modality_detach=False,
        temporal_conv_size=2,
        spatial_conv_size=2,
        mm_vocab_size=0,
        max_text_id=None,
        use_temporal_conv=True,
        moe_use_size_all2all=False,
        moe_num_attn_experts=False,
        moe_dense_experts_token_type_id: int = 3,
        moe_use_hard_gate: bool = True,
        moe_fuse_experts: bool = False,
        moe_use_token_type_bias: bool = False,
        disable_ffn_model_parallel=False,
        fuse_attn_ffn=True,
        rope_3d=True,
        freq_allocation=20,
        using_precision_check=False,
        use_recompute_resampler=False,
        resampler_fuse_rms_norm=False,
        moe_layer_feed_fake_token=False,
        tensor_parallel_degree=1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if isinstance(vision_config, dict):
            self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
        elif isinstance(vision_config, DFNRopeVisionTransformerConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = DFNRopeVisionTransformerConfig()
        self.im_patch_id = im_patch_id
        self.pixel_hidden_size = pixel_hidden_size
        self.modality_detach = modality_detach
        self.temporal_conv_size = temporal_conv_size
        self.spatial_conv_size = spatial_conv_size
        self.mm_vocab_size = mm_vocab_size
        self.max_text_id = max_text_id
        self.use_temporal_conv = use_temporal_conv

        self.moe_use_size_all2all = moe_use_size_all2all
        self.moe_num_attn_experts = moe_num_attn_experts
        self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
        self.moe_use_hard_gate = moe_use_hard_gate
        self.moe_fuse_experts = moe_fuse_experts
        self.moe_use_token_type_bias = moe_use_token_type_bias
        self.disable_ffn_model_parallel = disable_ffn_model_parallel

        self.fuse_attn_ffn = fuse_attn_ffn
        self.rope_3d = rope_3d
        self.freq_allocation = freq_allocation
        self.using_precision_check = using_precision_check
        self.use_recompute_resampler = use_recompute_resampler
        self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
        self.moe_layer_feed_fake_token = moe_layer_feed_fake_token

        self.tensor_parallel_degree = tensor_parallel_degree

    @property
    def multimodel_experts(self) -> bool:
        """Check whether the model uses more than one group of multimodal experts."""
        return (
            isinstance(self.moe_num_experts, (tuple, list))
            and len(self.moe_num_experts) > 1
        )

    @property
    def use_moe(self) -> bool:
        """
        Check if the model uses an MoE architecture.

        Returns:
            bool: True if any experts are configured, False otherwise.
        """
        return (
            sum(self.moe_num_experts) > 0
            if self.multimodel_experts
            else self.moe_num_experts > 0
        )

    def to_dict(self, saving_file=False):
        """Serialize the config to a dictionary, including the nested vision config."""
        output = copy.deepcopy(self.__dict__)
        if self.vision_config:
            output["vision_config"] = (
                self.vision_config.to_dict()
                if isinstance(self.vision_config, DFNRopeVisionTransformerConfig)
                else self.vision_config
            )
        output["model_type"] = self.__class__.model_type
        return output
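

# Illustrative usage (added commentary, not from the original file):
# `vision_config` may be passed as a plain dict and is normalized to a
# DFNRopeVisionTransformerConfig; `to_dict` serializes it back for saving.
#
#     >>> vl_cfg = Ernie4_5_VLMoEConfig(
#     ...     vision_config={"depth": 32, "embed_dim": 1280},
#     ...     moe_num_experts=[64, 64],
#     ... )
#     >>> vl_cfg.use_moe
#     True
#     >>> isinstance(vl_cfg.to_dict()["vision_config"], dict)
#     True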