Update configuration_ernie4_5_vl.py
#13
by hmellor HF Staff - opened
configuration_ernie4_5_vl.py
CHANGED
|
@@ -430,14 +430,12 @@ class Ernie4_5_MoEConfig(Ernie4_5_Config):
|
|
| 430 |
Note:
|
| 431 |
When use_recompute_moe is True and recompute is enabled with recompute_granularity set to "full", recompute_granularity is changed to "full_attn".
|
| 432 |
"""
|
| 433 |
-
|
| 434 |
if use_recompute_moe:
|
| 435 |
logger.warning(
|
| 436 |
"set `use_recompute_moe`=True, disabling `recompute_granularity=full`, change to full_attn."
|
| 437 |
)
|
| 438 |
if kwargs["recompute"] and kwargs["recompute_granularity"] == "full":
|
| 439 |
kwargs["recompute_granularity"] = "full_attn"
|
| 440 |
-
super().__init__(**kwargs)
|
| 441 |
|
| 442 |
self.moe_num_experts = moe_num_experts
|
| 443 |
self.use_recompute_moe = use_recompute_moe
|
|
@@ -477,6 +475,7 @@ class Ernie4_5_MoEConfig(Ernie4_5_Config):
|
|
| 477 |
)
|
| 478 |
self.moe_use_hard_gate = moe_use_hard_gate
|
| 479 |
self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
|
|
|
|
| 480 |
|
| 481 |
@property
|
| 482 |
def multimodel_experts(self) -> bool:
|
|
@@ -582,7 +581,6 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
|
|
| 582 |
tensor_parallel_degree=1,
|
| 583 |
**kwargs,
|
| 584 |
):
|
| 585 |
-
super().__init__(**kwargs)
|
| 586 |
if isinstance(vision_config, dict):
|
| 587 |
self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
|
| 588 |
else:
|
|
@@ -613,6 +611,7 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
|
|
| 613 |
self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
|
| 614 |
|
| 615 |
self.tensor_parallel_degree = tensor_parallel_degree
|
|
|
|
| 616 |
|
| 617 |
@property
|
| 618 |
def multimodel_experts(self) -> bool:
|
|
|
|
| 430 |
Note:
|
| 431 |
When use_recompute_moe is True and recompute is enabled with recompute_granularity set to "full", recompute_granularity is changed to "full_attn".
|
| 432 |
"""
|
|
|
|
| 433 |
if use_recompute_moe:
|
| 434 |
logger.warning(
|
| 435 |
"set `use_recompute_moe`=True, disabling `recompute_granularity=full`, change to full_attn."
|
| 436 |
)
|
| 437 |
if kwargs["recompute"] and kwargs["recompute_granularity"] == "full":
|
| 438 |
kwargs["recompute_granularity"] = "full_attn"
|
|
|
|
| 439 |
|
| 440 |
self.moe_num_experts = moe_num_experts
|
| 441 |
self.use_recompute_moe = use_recompute_moe
|
|
|
|
| 475 |
)
|
| 476 |
self.moe_use_hard_gate = moe_use_hard_gate
|
| 477 |
self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
|
| 478 |
+
super().__init__(**kwargs)
|
| 479 |
|
| 480 |
@property
|
| 481 |
def multimodel_experts(self) -> bool:
|
|
|
|
| 581 |
tensor_parallel_degree=1,
|
| 582 |
**kwargs,
|
| 583 |
):
|
|
|
|
| 584 |
if isinstance(vision_config, dict):
|
| 585 |
self.vision_config = DFNRopeVisionTransformerConfig(**vision_config)
|
| 586 |
else:
|
|
|
|
| 611 |
self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
|
| 612 |
|
| 613 |
self.tensor_parallel_degree = tensor_parallel_degree
|
| 614 |
+
super().__init__(**kwargs)
|
| 615 |
|
| 616 |
@property
|
| 617 |
def multimodel_experts(self) -> bool:
|