Upload mythos-fineweb-moe.py with huggingface_hub
Browse files
- mythos-fineweb-moe.py +2 -1
mythos-fineweb-moe.py
CHANGED
|
@@ -1035,6 +1035,7 @@ def main():
|
|
| 1035 |
cfg.vocab_size = vocab_size
|
| 1036 |
|
| 1037 |
bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
|
|
|
|
| 1038 |
amp_dtype = torch.bfloat16 if bf16_ok else torch.float16
|
| 1039 |
|
| 1040 |
model = SpiderPortalForConditionalGeneration(cfg).to(torch.bfloat16)
|
|
@@ -1058,7 +1059,7 @@ def main():
|
|
| 1058 |
model = model.to(device)
|
| 1059 |
|
| 1060 |
if master:
|
| 1061 |
-
logger.info("MoE mode: … [removed line; remainder truncated in this diff view]
|
| 1062 |
|
| 1063 |
# MoE init checkpoint (skip dense conversion, load MoE weights directly)
|
| 1064 |
moe_init_ckpt = os.environ.get("MOE_INIT_CKPT", "")
|
|
|
|
| 1035 |
cfg.vocab_size = vocab_size
|
| 1036 |
|
| 1037 |
bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
|
| 1038 |
+
use_mxfp8 = os.environ.get("MXFP8", "0") == "1"
|
| 1039 |
amp_dtype = torch.bfloat16 if bf16_ok else torch.float16
|
| 1040 |
|
| 1041 |
model = SpiderPortalForConditionalGeneration(cfg).to(torch.bfloat16)
|
|
|
|
| 1059 |
model = model.to(device)
|
| 1060 |
|
| 1061 |
if master:
|
| 1062 |
+
logger.info(f"MoE mode: {amp_dtype} | MXFP8 hardware acceleration: {'ENABLED' if use_mxfp8 else 'disabled (set MXFP8=1)'}")
|
| 1063 |
|
| 1064 |
# MoE init checkpoint (skip dense conversion, load MoE weights directly)
|
| 1065 |
moe_init_ckpt = os.environ.get("MOE_INIT_CKPT", "")
|