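Update configuration_ernie4_5_vl.py: move the super().__init__(**kwargs) call after the attribute assignments in Ernie4_5_MoEConfig and Ernie4_5_VLMoEConfig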

#13
Opened by hmellor (HF Staff)
Files changed (1)
  1. configuration_ernie4_5_vl.py +2 -3
configuration_ernie4_5_vl.py CHANGED
@@ -430,14 +430,12 @@ class Ernie4_5_MoEConfig(Ernie4_5_Config):
430
  Note:
431
  When use_recompute_moe is True, recompute_granularity will be changed to full_attn.
432
  """
433
-
434
  if use_recompute_moe:
435
  logger.warning(
436
  "set `use_recompute_moe`=True, disabling `recompute_granularity=full`, change to full_attn."
437
  )
438
  if kwargs["recompute"] and kwargs["recompute_granularity"] == "full":
439
  kwargs["recompute_granularity"] = "full_attn"
440
- super().__init__(**kwargs)
441
 
442
  self.moe_num_experts = moe_num_experts
443
  self.use_recompute_moe = use_recompute_moe
@@ -477,6 +475,7 @@ class Ernie4_5_MoEConfig(Ernie4_5_Config):
477
  )
478
  self.moe_use_hard_gate = moe_use_hard_gate
479
  self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
 
480
 
481
  @property
482
  def multimodel_experts(self) -> bool:
@@ -582,7 +581,6 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
582
  tensor_parallel_degree=1,
583
  **kwargs,
584
  ):
585
- super().__init__(**kwargs)
586
  if isinstance(vision_config, dict):
587
  self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
588
  else:
@@ -613,6 +611,7 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
613
  self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
614
 
615
  self.tensor_parallel_degree = tensor_parallel_degree
 
616
 
617
  @property
618
  def multimodel_experts(self) -> bool:
 
430
  Note:
431
  When use_recompute_moe is True, recompute_granularity will be changed to full_attn.
432
  """
 
433
  if use_recompute_moe:
434
  logger.warning(
435
  "set `use_recompute_moe`=True, disabling `recompute_granularity=full`, change to full_attn."
436
  )
437
  if kwargs["recompute"] and kwargs["recompute_granularity"] == "full":
438
  kwargs["recompute_granularity"] = "full_attn"
 
439
 
440
  self.moe_num_experts = moe_num_experts
441
  self.use_recompute_moe = use_recompute_moe
 
475
  )
476
  self.moe_use_hard_gate = moe_use_hard_gate
477
  self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
478
+ super().__init__(**kwargs)
479
 
480
  @property
481
  def multimodel_experts(self) -> bool:
 
581
  tensor_parallel_degree=1,
582
  **kwargs,
583
  ):
 
584
  if isinstance(vision_config, dict):
585
  self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
586
  else:
 
611
  self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
612
 
613
  self.tensor_parallel_degree = tensor_parallel_degree
614
+ super().__init__(**kwargs)
615
 
616
  @property
617
  def multimodel_experts(self) -> bool: