MXFP8 Kernel Assertion Failure — Non-aligned dimension n=3392 not divisible by block size 128
#5 opened by R-omk
sglang-mimo-v2.5-nvfp4:
  extends:
    file: templates.yaml
    service: gpus_all_rtx_pro_6000
  # Specialized image for MiMo V2.5 NVFP4 quantization
  image: docker.io/lukealonso/sglang-cuda13-b12x
  command:
    - python3
    - -m
    - sglang.launch_server
    - --model-path
    - lukealonso/MiMo-V2.5-NVFP4
    - --served-model-name
    - MiMo-V2.5-NVFP4
    - --tp-size
    - "4"
    - --page-size
    - "64"
    - --host
    - 0.0.0.0
    - --port
    - "8000"
    - --kv-cache-dtype
    - fp8_e4m3
    - --mem-fraction-static
    - "0.85"
    - --swa-full-tokens-ratio
    - "0.3"
    - --chunked-prefill-size
    - "8192"
    - --speculative-algorithm
    - EAGLE
    - --speculative-num-steps
    - "3"
    - --speculative-eagle-topk
    - "1"
    - --speculative-num-draft-tokens
    - "4"
    - --enable-pcie-oneshot-allreduce
    - --enable-multi-layer-eagle
    - --reasoning-parser
    - mimo
    - --tool-call-parser
    - mimo
    - --max-running-requests
    - "8"
    - --moe-runner-backend
    - b12x
    - --attention-backend
    - b12x
    - --mm-attention-backend
    - b12x
    - --fp4-gemm-backend
    - b12x
  environment:
    CUDA_DEVICE_ORDER: PCI_BUS_ID
    TRANSFORMERS_OFFLINE: 1
    HF_HUB_OFFLINE: 1
    OMP_NUM_THREADS: "16"
    SAFETENSORS_FAST_GPU: 1
    CUTE_DSL_ARCH: sm_120a
  shm_size: 32g
  ipc: host
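
Note: the command pins the attention, MoE, multimodal-attention, and FP4 GEMM backends to b12x, but sets no FP8 GEMM backend, so it keeps its default (the server_args dump in the logs below shows fp8_gemm_runner_backend='auto'). This appears relevant because the crash below happens in an FP8 GEMM path, not an FP4 one.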
Logs:
==========
== CUDA ==
==========
CUDA Version 13.0.3
/opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
/opt/sglang/python/sglang/launch_server.py:58: UserWarning: 'python -m sglang.launch_server' is still supported, but 'sglang serve' is the recommended entrypoint.
Example: sglang serve --model-path <model> [options]
warnings.warn(
[transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[2026-05-05 19:25:58] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-05-05 19:25:58] Enable multi-layer EAGLE speculative decoding for MiMoV2 model.
[2026-05-05 19:25:58] Spec v2 is enabled by default for eagle/eagle3/standalone speculative decoding.
[2026-05-05 19:26:00] server_args=ServerArgs(model_path='lukealonso/MiMo-V2.5-NVFP4', tokenizer_path='lukealonso/MiMo-V2.5-NVFP4', tokenizer_mode='auto', tokenizer_backend='huggingface', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='0.0.0.0', port=8000, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_keyfile_password=None, enable_ssl_refresh=False, enable_http2=False, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='fp8_e4m3', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.85, max_running_requests=8, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=8192, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, disable_priority_preemption=False, default_priority_value=None, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=64, swa_full_tokens_ratio=0.3, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', enable_prefill_delayer=False, prefill_delayer_max_delay_passes=30, prefill_delayer_token_usage_low_watermark=None, prefill_delayer_forward_passes_buckets=None, prefill_delayer_wait_seconds_buckets=None, device='cuda', tp_size=4, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, batch_notify_size=16, stream_response_default_include_usage=False, incremental_streaming_output=False, enable_streaming_session=False, random_seed=842411346, constrained_json_whitespace_pattern=None, constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, model_checksum=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, use_ray=False, custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, log_requests_format='text', log_requests_target=None, uvicorn_access_log_exclude_prefixes=[], crash_dump_folder=None, show_time_cost=False, enable_metrics=False, grpc_http_sidecar_port=None, enable_mfu_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, extra_metric_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key=None, admin_api_key=None, served_model_name='MiMo-V2.5-NVFP4', weight_version='default', chat_template=None, hf_chat_template_name=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser='mimo', strip_thinking_cache=False, tool_call_parser='mimo', 
tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', attn_cp_size=1, moe_dp_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, enable_lora_overlap_loading=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_eviction_policy='lru', lora_backend='csgmv', max_lora_chunk_size=16, experts_shared_outer_loras=None, lora_use_virtual_experts=False, lora_strict_loading=False, lora_drain_wait_threshold=0.0, attention_backend='b12x', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend='b12x', fp8_gemm_runner_backend='auto', fp4_gemm_runner_backend='b12x', nsa_prefill_backend=None, nsa_decode_backend=None, disable_flashinfer_autotune=False, mamba_backend='triton', speculative_algorithm='EAGLE', speculative_draft_model_path=None, speculative_draft_model_revision=None, speculative_draft_load_format=None, speculative_num_steps=3, speculative_eagle_topk=1, speculative_num_draft_tokens=4, speculative_dflash_block_size=None, speculative_dflash_draft_window_size=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='b12x', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_adaptive=False, speculative_adaptive_config=None, speculative_skip_dp_mlp_sync=False, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_max_trie_depth=18, speculative_ngram_capacity=10000000, speculative_ngram_external_corpus_path=None, speculative_ngram_external_sam_budget=0, speculative_ngram_external_corpus_max_tokens=10000000, enable_multi_layer_eagle=True, ep_size=1, moe_a2a_backend='none', moe_runner_backend='b12x', record_nolora_graph=True, flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, enforce_disable_flashinfer_allreduce_fusion=False, enable_aiter_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, enable_elastic_expert_backup=False, mooncake_ib_device=None, elastic_ep_rejoin=False, max_mamba_cache_size=None, mamba_ssm_dtype=None, mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, linear_attn_backend='triton', linear_attn_decode_backend=None, linear_attn_prefill_backend=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, enable_hisparse=False, hisparse_config=None, enable_lmcache=False, kt_weight_path=None, kt_method='AMXINT4', kt_cpuinfer=None, kt_threadpool_count=2, kt_num_gpu_experts=None, 
kt_max_deferred_experts_per_token=None, dllm_algorithm=None, dllm_algorithm_config=None, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', enable_mis=False, disable_radix_cache=False, cuda_graph_max_bs=512, cuda_graph_bs=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_breakable_cuda_graph=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, debug_cuda_graph=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_pcie_oneshot_allreduce=True, pcie_oneshot_allreduce_max_size='64KB', enable_mscclpp=False, enable_torch_symm_mem=False, pre_warm_nccl=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_attention_local_control_broadcast=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, disable_piecewise_cuda_graph=True, enforce_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=8192, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, enforce_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, enable_attn_tp_input_scattered=False, gc_threshold=None, enable_nsa_prefill_context_parallel=False, nsa_prefill_cp_mode='round-robin-split', enable_fused_qk_norm_rope=False, enable_precise_embedding_interpolation=False, enable_fused_moe_sum_all_reduce=False, enable_prefill_context_parallel=False, prefill_cp_mode='in-seq-split', enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_ib_device=None, 
disaggregation_decode_enable_radix_cache=False, disaggregation_decode_enable_offload_kvcache=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], enable_adaptive_dispatch_to_encoder=False, custom_weight_loader=[], weight_loader_disable_mmap=False, weight_loader_prefetch_checkpoints=False, weight_loader_prefetch_num_threads=4, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, engine_info_bootstrap_port=6789, modelexpress_config=None, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, limit_mm_data_per_request=None, enable_mm_global_cache=False, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None, enable_quant_communications=False, msprobe_dump_config=None)
/opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
/opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
[transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[2026-05-05 19:26:05 TP0] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
[2026-05-05 19:26:07 TP0] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-05-05 19:26:07 TP0] Init torch distributed begin.
/opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
[transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[2026-05-05 19:26:10 TP1] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
[2026-05-05 19:26:12 TP1] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-05-05 19:26:12 TP1] Init torch distributed begin.
/opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
[transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[2026-05-05 19:26:15 TP2] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
[2026-05-05 19:26:17 TP2] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-05-05 19:26:17 TP2] Init torch distributed begin.
/opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
[transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[2026-05-05 19:26:21 TP3] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
[2026-05-05 19:26:22 TP3] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[2026-05-05 19:26:22 TP3] Init torch distributed begin.
[2026-05-05 19:26:22 TP0] sglang is using nccl==2.28.9
[2026-05-05 19:26:26] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
[transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
[2026-05-05 19:26:27] Using default HuggingFace chat template with detected content format: openai
[2026-05-05 19:26:51 TP2] Init torch distributed ends. elapsed=34.79 s, mem usage=0.33 GB
[2026-05-05 19:26:51 TP0] Init torch distributed ends. elapsed=44.61 s, mem usage=0.33 GB
[2026-05-05 19:26:51 TP3] Init torch distributed ends. elapsed=29.62 s, mem usage=0.33 GB
[2026-05-05 19:26:51 TP1] Init torch distributed ends. elapsed=39.84 s, mem usage=0.33 GB
[2026-05-05 19:26:52 TP2] Load weight begin. avail mem=94.07 GB
[2026-05-05 19:26:52 TP2] Using ModelOptModelLoader due to ModelOpt quantization config.
[2026-05-05 19:26:52 TP2] ModelOptModelLoader: Loading base model...
[2026-05-05 19:26:52 TP2] Model is already quantized, loading directly...
[2026-05-05 19:26:52 TP2] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
[2026-05-05 19:26:52 TP2] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
[2026-05-05 19:26:52 TP2] Skipping block quantization checks for weight partition.
[2026-05-05 19:26:52 TP3] Load weight begin. avail mem=94.07 GB
[2026-05-05 19:26:52 TP3] Using ModelOptModelLoader due to ModelOpt quantization config.
[2026-05-05 19:26:52 TP3] ModelOptModelLoader: Loading base model...
[2026-05-05 19:26:52 TP3] Model is already quantized, loading directly...
[2026-05-05 19:26:52 TP1] Load weight begin. avail mem=94.07 GB
[2026-05-05 19:26:52 TP1] Using ModelOptModelLoader due to ModelOpt quantization config.
[2026-05-05 19:26:52 TP1] ModelOptModelLoader: Loading base model...
[2026-05-05 19:26:52 TP1] Model is already quantized, loading directly...
[2026-05-05 19:26:52 TP3] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
[2026-05-05 19:26:52 TP3] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
[2026-05-05 19:26:52 TP1] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
[2026-05-05 19:26:52 TP1] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
[2026-05-05 19:26:52 TP0] Load weight begin. avail mem=94.07 GB
[2026-05-05 19:26:52 TP0] Using ModelOptModelLoader due to ModelOpt quantization config.
[2026-05-05 19:26:52 TP0] ModelOptModelLoader: Loading base model...
[2026-05-05 19:26:52 TP0] Model is already quantized, loading directly...
[2026-05-05 19:26:52 TP0] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
[2026-05-05 19:26:52 TP0] Detected fp8 checkpoint.
[2026-05-05 19:26:52 TP0] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
[2026-05-05 19:26:52 TP3] Skipping block quantization checks for weight partition.
[2026-05-05 19:26:52 TP1] Skipping block quantization checks for weight partition.
[2026-05-05 19:26:52 TP0] Skipping block quantization checks for weight partition.
[2026-05-05 19:26:52 TP2] Using b12x as multimodal attention backend.
[2026-05-05 19:26:53 TP3] Using b12x as multimodal attention backend.
[2026-05-05 19:26:53 TP1] Using b12x as multimodal attention backend.
[2026-05-05 19:26:53 TP0] Using b12x as multimodal attention backend.
[2026-05-05 19:26:53 TP0] Found local HF snapshot for lukealonso/MiMo-V2.5-NVFP4 at /root/.cache/huggingface/hub/models--lukealonso--MiMo-V2.5-NVFP4/snapshots/e31cc15ee78c5fbfd662f350e5b47d844311303f; skipping download.
Multi-thread loading shards: 0% Completed | 0/37 [00:00<?, ?it/s]
Multi-thread loading shards: 3% Completed | 1/37 [00:01<00:51, 1.42s/it]
Multi-thread loading shards: 5% Completed | 2/37 [00:01<00:31, 1.12it/s]
Multi-thread loading shards: 8% Completed | 3/37 [00:03<00:33, 1.03it/s]
Multi-thread loading shards: 11% Completed | 4/37 [00:03<00:26, 1.22it/s]
Multi-thread loading shards: 14% Completed | 5/37 [00:04<00:23, 1.35it/s]
Multi-thread loading shards: 16% Completed | 6/37 [00:04<00:21, 1.47it/s]
Multi-thread loading shards: 19% Completed | 7/37 [00:05<00:23, 1.27it/s]
Multi-thread loading shards: 22% Completed | 8/37 [00:06<00:20, 1.40it/s]
Multi-thread loading shards: 24% Completed | 9/37 [00:06<00:18, 1.51it/s]
Multi-thread loading shards: 27% Completed | 10/37 [00:07<00:17, 1.58it/s]
Multi-thread loading shards: 30% Completed | 11/37 [00:07<00:15, 1.67it/s]
Multi-thread loading shards: 32% Completed | 12/37 [00:08<00:14, 1.73it/s]
Multi-thread loading shards: 35% Completed | 13/37 [00:09<00:13, 1.73it/s]
Multi-thread loading shards: 38% Completed | 14/37 [00:09<00:13, 1.72it/s]
Multi-thread loading shards: 41% Completed | 15/37 [00:10<00:12, 1.74it/s]
Multi-thread loading shards: 43% Completed | 16/37 [00:10<00:11, 1.76it/s]
Multi-thread loading shards: 46% Completed | 17/37 [00:11<00:11, 1.77it/s]
Multi-thread loading shards: 49% Completed | 18/37 [00:11<00:10, 1.77it/s]
Multi-thread loading shards: 51% Completed | 19/37 [00:12<00:10, 1.77it/s]
Multi-thread loading shards: 54% Completed | 20/37 [00:13<00:09, 1.75it/s]
Multi-thread loading shards: 57% Completed | 21/37 [00:13<00:09, 1.75it/s]
Multi-thread loading shards: 59% Completed | 22/37 [00:14<00:08, 1.75it/s]
Multi-thread loading shards: 62% Completed | 23/37 [00:14<00:07, 1.77it/s]
Multi-thread loading shards: 65% Completed | 24/37 [00:15<00:07, 1.78it/s]
Multi-thread loading shards: 68% Completed | 25/37 [00:15<00:06, 1.79it/s]
Multi-thread loading shards: 70% Completed | 26/37 [00:16<00:06, 1.82it/s]
Multi-thread loading shards: 73% Completed | 27/37 [00:17<00:07, 1.27it/s]
Multi-thread loading shards: 76% Completed | 28/37 [00:18<00:06, 1.42it/s]
Multi-thread loading shards: 78% Completed | 29/37 [00:18<00:05, 1.54it/s]
Multi-thread loading shards: 81% Completed | 30/37 [00:19<00:04, 1.63it/s]
Multi-thread loading shards: 84% Completed | 31/37 [00:19<00:03, 1.71it/s]
Multi-thread loading shards: 86% Completed | 32/37 [00:20<00:02, 1.74it/s]
Multi-thread loading shards: 89% Completed | 33/37 [00:20<00:02, 1.77it/s]
Multi-thread loading shards: 92% Completed | 34/37 [00:21<00:01, 1.92it/s]
Multi-thread loading shards: 95% Completed | 35/37 [00:21<00:00, 2.33it/s]
[2026-05-05 19:27:16 TP2] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
Multi-thread loading shards: 97% Completed | 36/37 [00:23<00:00, 1.28it/s]
[2026-05-05 19:27:17 TP0] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
Multi-thread loading shards: 100% Completed | 37/37 [00:23<00:00, 1.60it/s]
[2026-05-05 19:27:17 TP1] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
[2026-05-05 19:27:17 TP3] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
[2026-05-05 19:27:28 TP2] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
[2026-05-05 19:27:28 TP2] Load weight end. elapsed=36.06 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
[2026-05-05 19:27:28 TP0] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
[2026-05-05 19:27:28 TP0] Load weight end. elapsed=35.91 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
[2026-05-05 19:27:28 TP1] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
[2026-05-05 19:27:28 TP1] Load weight end. elapsed=36.14 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
[2026-05-05 19:27:28 TP3] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
[2026-05-05 19:27:28 TP3] Load weight end. elapsed=36.45 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
[2026-05-05 19:27:28 TP0] Using KV cache dtype: torch.float8_e4m3fn
[2026-05-05 19:27:28 TP0] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
[2026-05-05 19:27:28 TP3] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
[2026-05-05 19:27:28 TP2] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
[2026-05-05 19:27:28 TP1] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
[2026-05-05 19:27:28 TP3] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
[2026-05-05 19:27:28 TP2] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
[2026-05-05 19:27:28 TP0] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
[2026-05-05 19:27:28 TP1] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
[2026-05-05 19:27:28 TP2] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
[2026-05-05 19:27:28 TP3] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
[2026-05-05 19:27:28 TP2] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
[2026-05-05 19:27:28 TP3] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
[2026-05-05 19:27:28 TP2] Memory pool end. avail mem=13.97 GB
[2026-05-05 19:27:28 TP0] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
[2026-05-05 19:27:28 TP3] Memory pool end. avail mem=13.97 GB
[2026-05-05 19:27:28 TP0] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
[2026-05-05 19:27:28 TP1] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
[2026-05-05 19:27:28 TP1] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
[2026-05-05 19:27:28 TP0] Memory pool end. avail mem=13.97 GB
[2026-05-05 19:27:28 TP1] Memory pool end. avail mem=13.97 GB
[2026-05-05 19:27:29 TP3] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
[2026-05-05 19:27:29 TP1] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
[2026-05-05 19:27:29 TP2] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
[2026-05-05 19:27:29 TP0] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
[2026-05-05 19:27:29 TP0] Capture cuda graph bs [1, 2, 3, 4, 5, 6, 7, 8]
0%| | 0/8 [00:00<?, ?it/s]
Capturing batches (bs=8 avail_mem=13.02 GB): 0%| | 0/8 [00:00<?, ?it/s]
Capturing batches (bs=8 avail_mem=13.02 GB): 0%| | 0/8 [00:02<?, ?it/s]
[2026-05-05 19:27:32 TP1] Scheduler hit an exception: Traceback (most recent call last):
  File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 3963, in run_scheduler_process
    scheduler = Scheduler(
                ^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 433, in __init__
    self.init_model_worker()
  File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 703, in init_model_worker
    self.init_tp_model_worker()
  File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 658, in init_tp_model_worker
    self.tp_worker = TpModelWorker(**worker_kwargs)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/managers/tp_worker.py", line 260, in __init__
    self._init_model_runner()
  File "/opt/sglang/python/sglang/srt/managers/tp_worker.py", line 345, in _init_model_runner
    self._model_runner = ModelRunner(
                         ^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/model_executor/model_runner.py", line 511, in __init__
    self.initialize(pre_model_load_memory)
  File "/opt/sglang/python/sglang/srt/model_executor/model_runner.py", line 774, in initialize
    self.init_device_graphs()
  File "/opt/sglang/python/sglang/srt/model_executor/model_runner.py", line 2796, in init_device_graphs
    self.graph_runner = graph_runners[self.device](self)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 730, in __init__
    self.capture()
  File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 913, in capture
    _capture_one_stream()
  File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 901, in _capture_one_stream
    ) = self.capture_one_batch_size(bs, forward, stream_idx)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 1188, in capture_one_batch_size
    run_once()
  File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 1166, in run_once
    logits_output_or_pp_proxy_tensors = forward(
                                        ^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 1173, in forward
    hidden_states, hidden_states_before_norm = general_mm_embed_routine(
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/managers/mm_utils.py", line 1106, in general_mm_embed_routine
    hidden_states = language_model(
                    ^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 1002, in forward
    hidden_states, residual = layer(
                              ^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 797, in forward
    hidden_states = self.self_attn(
                    ^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 652, in forward
    qkv, _ = self.qkv_proj(hidden_states)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/layers/linear.py", line 466, in forward
    output_parallel = self.quant_method.apply(self, input_, bias)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/layers/quantization/fp8.py", line 751, in apply
    return self.w8a8_mxfp8_linear(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/torch/_ops.py", line 1269, in __call__
    return self._op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/layers/quantization/fp8_utils.py", line 1016, in triton_mxfp8_blockscaled_linear
    return _raw_triton_mxfp8_blockscaled_linear(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/sglang/python/sglang/srt/layers/quantization/fp8_utils.py", line 941, in _raw_triton_mxfp8_blockscaled_linear
    assert n % block_n == 0, f"{n=} must be divisible by {block_n}"
           ^^^^^^^^^^^^^^^^
AssertionError: n=3392 must be divisible by 128
[2026-05-05 19:27:32] Received sigquit from a child process. It usually means the child failed.
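
Diagnosis: the checkpoint is mixed precision (quant=modelopt_mixed, quant_algo=MIXED_PRECISION), so qkv_proj takes the FP8 path, and with fp8_gemm_runner_backend left at 'auto' it gets dispatched to the Triton MXFP8 block-scaled GEMM. That kernel requires the GEMM output dimension n to be a multiple of its 128-wide scale blocks, but the per-rank qkv_proj output width under tp_size=4 is n=3392, which is not. A minimal sketch of the failing check, with the values taken from the traceback above (variable names mirror the assertion in fp8_utils.py; n=3392 comes from the log, not from recomputing the model's dims):

# Sketch of the shape check that fires in fp8_utils.py.
n = 3392        # per-rank qkv_proj output width reported in the traceback
block_n = 128   # MXFP8 scale-block width along n

print(divmod(n, block_n))  # (26, 64): 3392 = 26 * 128 + 64, so the check cannot pass
assert n % block_n == 0, f"{n=} must be divisible by {block_n}"
# -> AssertionError: n=3392 must be divisible by 128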
Workaround: pin the FP8 GEMM backend explicitly so these layers skip the failing Triton MXFP8 path:

--fp8-gemm-backend flashinfer_cutlass
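
In the service definition above, this corresponds to two additional entries under command: (untested sketch of the same flag):

    - --fp8-gemm-backend
    - flashinfer_cutlass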