MXFP8 Kernel Assertion Failure — Non-aligned dimension n=3392 not divisible by block size 128

#5
by R-omk - opened
  sglang-mimo-v2.5-nvfp4:
    extends:
      file: templates.yaml
      service: gpus_all_rtx_pro_6000

    # Specialized image for MiMo V2.5 NVFP4 quantization
    image: docker.io/lukealonso/sglang-cuda13-b12x

    # SGLang OpenAI-compatible server: TP=4, FP8 KV cache, EAGLE
    # speculative decoding, b12x kernels. Numeric CLI values are quoted
    # so YAML passes them through as strings.
    command:
      - python3
      - -m
      - sglang.launch_server
      - --model-path
      - lukealonso/MiMo-V2.5-NVFP4
      - --served-model-name
      - MiMo-V2.5-NVFP4
      - --tp-size
      - "4"
      - --page-size
      - "64"
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --kv-cache-dtype
      - fp8_e4m3
      - --mem-fraction-static
      - "0.85"
      - --swa-full-tokens-ratio
      - "0.3"
      - --chunked-prefill-size
      - "8192"
      - --speculative-algorithm
      - EAGLE
      - --speculative-num-steps
      - "3"
      - --speculative-eagle-topk
      - "1"
      - --speculative-num-draft-tokens
      - "4"
      - --enable-pcie-oneshot-allreduce
      - --enable-multi-layer-eagle
      - --reasoning-parser
      - mimo
      - --tool-call-parser
      - mimo
      - --max-running-requests
      - "8"
      - --moe-runner-backend
      - b12x
      - --attention-backend
      - b12x
      - --mm-attention-backend
      - b12x
      - --fp4-gemm-backend
      - b12x

    environment:
      CUDA_DEVICE_ORDER: PCI_BUS_ID
      # Numeric env values are quoted: Compose environment values are
      # strings, and a bare 1/16 is parsed as a YAML integer, which some
      # Compose versions and schema validators reject or coerce.
      TRANSFORMERS_OFFLINE: "1"
      HF_HUB_OFFLINE: "1"
      OMP_NUM_THREADS: "16"
      SAFETENSORS_FAST_GPU: "1"
      CUTE_DSL_ARCH: sm_120a

    shm_size: 32g
    ipc: host
logs

 
 ==========
 == CUDA ==
 ==========
 
 CUDA Version 13.0.3
 
 Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 This container image and its contents are governed by the NVIDIA Deep Learning Container License.
 By pulling and using the container, you accept the terms and conditions of this license:
 https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
 
 A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
 
 /opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
   import pynvml  # type: ignore[import]
 /opt/sglang/python/sglang/launch_server.py:58: UserWarning: 'python -m sglang.launch_server' is still supported, but 'sglang serve' is the recommended entrypoint.
   Example: sglang serve --model-path <model> [options]
   warnings.warn(
 [transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 [2026-05-05 19:25:58] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [2026-05-05 19:25:58] Enable multi-layer EAGLE speculative decoding for MiMoV2 model.
 [2026-05-05 19:25:58] Spec v2 is enabled by default for eagle/eagle3/standalone speculative decoding.
 [2026-05-05 19:26:00] server_args=ServerArgs(model_path='lukealonso/MiMo-V2.5-NVFP4', tokenizer_path='lukealonso/MiMo-V2.5-NVFP4', tokenizer_mode='auto', tokenizer_backend='huggingface', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='0.0.0.0', port=8000, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_keyfile_password=None, enable_ssl_refresh=False, enable_http2=False, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='fp8_e4m3', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.85, max_running_requests=8, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=8192, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, disable_priority_preemption=False, default_priority_value=None, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=64, swa_full_tokens_ratio=0.3, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', enable_prefill_delayer=False, prefill_delayer_max_delay_passes=30, prefill_delayer_token_usage_low_watermark=None, prefill_delayer_forward_passes_buckets=None, prefill_delayer_wait_seconds_buckets=None, device='cuda', tp_size=4, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, batch_notify_size=16, stream_response_default_include_usage=False, 
incremental_streaming_output=False, enable_streaming_session=False, random_seed=842411346, constrained_json_whitespace_pattern=None, constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, model_checksum=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, use_ray=False, custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, log_requests_format='text', log_requests_target=None, uvicorn_access_log_exclude_prefixes=[], crash_dump_folder=None, show_time_cost=False, enable_metrics=False, grpc_http_sidecar_port=None, enable_mfu_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, extra_metric_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key=None, admin_api_key=None, served_model_name='MiMo-V2.5-NVFP4', weight_version='default', chat_template=None, hf_chat_template_name=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser='mimo', strip_thinking_cache=False, tool_call_parser='mimo', tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', attn_cp_size=1, moe_dp_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, enable_lora_overlap_loading=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_eviction_policy='lru', 
lora_backend='csgmv', max_lora_chunk_size=16, experts_shared_outer_loras=None, lora_use_virtual_experts=False, lora_strict_loading=False, lora_drain_wait_threshold=0.0, attention_backend='b12x', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend='b12x', fp8_gemm_runner_backend='auto', fp4_gemm_runner_backend='b12x', nsa_prefill_backend=None, nsa_decode_backend=None, disable_flashinfer_autotune=False, mamba_backend='triton', speculative_algorithm='EAGLE', speculative_draft_model_path=None, speculative_draft_model_revision=None, speculative_draft_load_format=None, speculative_num_steps=3, speculative_eagle_topk=1, speculative_num_draft_tokens=4, speculative_dflash_block_size=None, speculative_dflash_draft_window_size=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='b12x', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_adaptive=False, speculative_adaptive_config=None, speculative_skip_dp_mlp_sync=False, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_max_trie_depth=18, speculative_ngram_capacity=10000000, speculative_ngram_external_corpus_path=None, speculative_ngram_external_sam_budget=0, speculative_ngram_external_corpus_max_tokens=10000000, enable_multi_layer_eagle=True, ep_size=1, moe_a2a_backend='none', moe_runner_backend='b12x', record_nolora_graph=True, flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, enforce_disable_flashinfer_allreduce_fusion=False, enable_aiter_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', 
eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, enable_elastic_expert_backup=False, mooncake_ib_device=None, elastic_ep_rejoin=False, max_mamba_cache_size=None, mamba_ssm_dtype=None, mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, linear_attn_backend='triton', linear_attn_decode_backend=None, linear_attn_prefill_backend=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, enable_hisparse=False, hisparse_config=None, enable_lmcache=False, kt_weight_path=None, kt_method='AMXINT4', kt_cpuinfer=None, kt_threadpool_count=2, kt_num_gpu_experts=None, kt_max_deferred_experts_per_token=None, dllm_algorithm=None, dllm_algorithm_config=None, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', enable_mis=False, disable_radix_cache=False, cuda_graph_max_bs=512, cuda_graph_bs=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_breakable_cuda_graph=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, debug_cuda_graph=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, 
disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_pcie_oneshot_allreduce=True, pcie_oneshot_allreduce_max_size='64KB', enable_mscclpp=False, enable_torch_symm_mem=False, pre_warm_nccl=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_attention_local_control_broadcast=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, disable_piecewise_cuda_graph=True, enforce_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=8192, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, enforce_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, 
enable_attn_tp_input_scattered=False, gc_threshold=None, enable_nsa_prefill_context_parallel=False, nsa_prefill_cp_mode='round-robin-split', enable_fused_qk_norm_rope=False, enable_precise_embedding_interpolation=False, enable_fused_moe_sum_all_reduce=False, enable_prefill_context_parallel=False, prefill_cp_mode='in-seq-split', enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_ib_device=None, disaggregation_decode_enable_radix_cache=False, disaggregation_decode_enable_offload_kvcache=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], enable_adaptive_dispatch_to_encoder=False, custom_weight_loader=[], weight_loader_disable_mmap=False, weight_loader_prefetch_checkpoints=False, weight_loader_prefetch_num_threads=4, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, engine_info_bootstrap_port=6789, modelexpress_config=None, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, limit_mm_data_per_request=None, enable_mm_global_cache=False, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None, enable_quant_communications=False, msprobe_dump_config=None)
 /opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
   import pynvml  # type: ignore[import]
 /opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
   import pynvml  # type: ignore[import]
 [transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 [2026-05-05 19:26:05 TP0] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
 [2026-05-05 19:26:07 TP0] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [2026-05-05 19:26:07 TP0] Init torch distributed begin.
 /opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
   import pynvml  # type: ignore[import]
 [transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 [2026-05-05 19:26:10 TP1] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
 [2026-05-05 19:26:12 TP1] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [2026-05-05 19:26:12 TP1] Init torch distributed begin.
 /opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
   import pynvml  # type: ignore[import]
 [transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 [2026-05-05 19:26:15 TP2] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
 [2026-05-05 19:26:17 TP2] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [2026-05-05 19:26:17 TP2] Init torch distributed begin.
 /opt/venv/lib/python3.12/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
   import pynvml  # type: ignore[import]
 [transformers] The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 [2026-05-05 19:26:21 TP3] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
 [2026-05-05 19:26:22 TP3] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [2026-05-05 19:26:22 TP3] Init torch distributed begin.
 [2026-05-05 19:26:22 TP0] sglang is using nccl==2.28.9
 [2026-05-05 19:26:26] DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.
 [transformers] The `use_fast` parameter is deprecated and will be removed in a future version. Use `backend="torchvision"` instead of `use_fast=True`, or `backend="pil"` instead of `use_fast=False`.
 [2026-05-05 19:26:27] Using default HuggingFace chat template with detected content format: openai
 [2026-05-05 19:26:51 TP2] Init torch distributed ends. elapsed=34.79 s, mem usage=0.33 GB
 [2026-05-05 19:26:51 TP0] Init torch distributed ends. elapsed=44.61 s, mem usage=0.33 GB
 [2026-05-05 19:26:51 TP3] Init torch distributed ends. elapsed=29.62 s, mem usage=0.33 GB
 [2026-05-05 19:26:51 TP1] Init torch distributed ends. elapsed=39.84 s, mem usage=0.33 GB
 [2026-05-05 19:26:52 TP2] Load weight begin. avail mem=94.07 GB
 [2026-05-05 19:26:52 TP2] Using ModelOptModelLoader due to ModelOpt quantization config.
 [2026-05-05 19:26:52 TP2] ModelOptModelLoader: Loading base model...
 [2026-05-05 19:26:52 TP2] Model is already quantized, loading directly...
 [2026-05-05 19:26:52 TP2] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
 [2026-05-05 19:26:52 TP2] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
 [2026-05-05 19:26:52 TP2] Skipping block quantization checks for weight partition.
 [2026-05-05 19:26:52 TP3] Load weight begin. avail mem=94.07 GB
 [2026-05-05 19:26:52 TP3] Using ModelOptModelLoader due to ModelOpt quantization config.
 [2026-05-05 19:26:52 TP3] ModelOptModelLoader: Loading base model...
 [2026-05-05 19:26:52 TP3] Model is already quantized, loading directly...
 [2026-05-05 19:26:52 TP1] Load weight begin. avail mem=94.07 GB
 [2026-05-05 19:26:52 TP1] Using ModelOptModelLoader due to ModelOpt quantization config.
 [2026-05-05 19:26:52 TP1] ModelOptModelLoader: Loading base model...
 [2026-05-05 19:26:52 TP1] Model is already quantized, loading directly...
 [2026-05-05 19:26:52 TP3] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
 [2026-05-05 19:26:52 TP3] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
 [2026-05-05 19:26:52 TP1] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
 [2026-05-05 19:26:52 TP1] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
 [2026-05-05 19:26:52 TP0] Load weight begin. avail mem=94.07 GB
 [2026-05-05 19:26:52 TP0] Using ModelOptModelLoader due to ModelOpt quantization config.
 [2026-05-05 19:26:52 TP0] ModelOptModelLoader: Loading base model...
 [2026-05-05 19:26:52 TP0] Model is already quantized, loading directly...
 [2026-05-05 19:26:52 TP0] Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change.
 [2026-05-05 19:26:52 TP0] Detected fp8 checkpoint.
 [2026-05-05 19:26:52 TP0] Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.
 [2026-05-05 19:26:52 TP3] Skipping block quantization checks for weight partition.
 [2026-05-05 19:26:52 TP1] Skipping block quantization checks for weight partition.
 [2026-05-05 19:26:52 TP0] Skipping block quantization checks for weight partition.
 [2026-05-05 19:26:52 TP2] Using b12x as multimodal attention backend.
 [2026-05-05 19:26:53 TP3] Using b12x as multimodal attention backend.
 [2026-05-05 19:26:53 TP1] Using b12x as multimodal attention backend.
 [2026-05-05 19:26:53 TP0] Using b12x as multimodal attention backend.
 [2026-05-05 19:26:53 TP0] Found local HF snapshot for lukealonso/MiMo-V2.5-NVFP4 at /root/.cache/huggingface/hub/models--lukealonso--MiMo-V2.5-NVFP4/snapshots/e31cc15ee78c5fbfd662f350e5b47d844311303f; skipping download.
 
Multi-thread loading shards:   0% Completed | 0/37 [00:00<?, ?it/s]
Multi-thread loading shards:   3% Completed | 1/37 [00:01<00:51,  1.42s/it]
Multi-thread loading shards:   5% Completed | 2/37 [00:01<00:31,  1.12it/s]
Multi-thread loading shards:   8% Completed | 3/37 [00:03<00:33,  1.03it/s]
Multi-thread loading shards:  11% Completed | 4/37 [00:03<00:26,  1.22it/s]
Multi-thread loading shards:  14% Completed | 5/37 [00:04<00:23,  1.35it/s]
Multi-thread loading shards:  16% Completed | 6/37 [00:04<00:21,  1.47it/s]
Multi-thread loading shards:  19% Completed | 7/37 [00:05<00:23,  1.27it/s]
Multi-thread loading shards:  22% Completed | 8/37 [00:06<00:20,  1.40it/s]
Multi-thread loading shards:  24% Completed | 9/37 [00:06<00:18,  1.51it/s]
Multi-thread loading shards:  27% Completed | 10/37 [00:07<00:17,  1.58it/s]
Multi-thread loading shards:  30% Completed | 11/37 [00:07<00:15,  1.67it/s]
Multi-thread loading shards:  32% Completed | 12/37 [00:08<00:14,  1.73it/s]
Multi-thread loading shards:  35% Completed | 13/37 [00:09<00:13,  1.73it/s]
Multi-thread loading shards:  38% Completed | 14/37 [00:09<00:13,  1.72it/s]
Multi-thread loading shards:  41% Completed | 15/37 [00:10<00:12,  1.74it/s]
Multi-thread loading shards:  43% Completed | 16/37 [00:10<00:11,  1.76it/s]
Multi-thread loading shards:  46% Completed | 17/37 [00:11<00:11,  1.77it/s]
Multi-thread loading shards:  49% Completed | 18/37 [00:11<00:10,  1.77it/s]
Multi-thread loading shards:  51% Completed | 19/37 [00:12<00:10,  1.77it/s]
Multi-thread loading shards:  54% Completed | 20/37 [00:13<00:09,  1.75it/s]
Multi-thread loading shards:  57% Completed | 21/37 [00:13<00:09,  1.75it/s]
Multi-thread loading shards:  59% Completed | 22/37 [00:14<00:08,  1.75it/s]
Multi-thread loading shards:  62% Completed | 23/37 [00:14<00:07,  1.77it/s]
Multi-thread loading shards:  65% Completed | 24/37 [00:15<00:07,  1.78it/s]
Multi-thread loading shards:  68% Completed | 25/37 [00:15<00:06,  1.79it/s]
Multi-thread loading shards:  70% Completed | 26/37 [00:16<00:06,  1.82it/s]
Multi-thread loading shards:  73% Completed | 27/37 [00:17<00:07,  1.27it/s]
Multi-thread loading shards:  76% Completed | 28/37 [00:18<00:06,  1.42it/s]
Multi-thread loading shards:  78% Completed | 29/37 [00:18<00:05,  1.54it/s]
Multi-thread loading shards:  81% Completed | 30/37 [00:19<00:04,  1.63it/s]
Multi-thread loading shards:  84% Completed | 31/37 [00:19<00:03,  1.71it/s]
Multi-thread loading shards:  86% Completed | 32/37 [00:20<00:02,  1.74it/s]
Multi-thread loading shards:  89% Completed | 33/37 [00:20<00:02,  1.77it/s]
Multi-thread loading shards:  92% Completed | 34/37 [00:21<00:01,  1.92it/s]
Multi-thread loading shards:  95% Completed | 35/37 [00:21<00:00,  2.33it/s][2026-05-05 19:27:16 TP2] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
 
Multi-thread loading shards:  97% Completed | 36/37 [00:23<00:00,  1.28it/s][2026-05-05 19:27:17 TP0] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
 
Multi-thread loading shards: 100% Completed | 37/37 [00:23<00:00,  1.60it/s]
 [2026-05-05 19:27:17 TP1] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
 [2026-05-05 19:27:17 TP3] Skipping draft-only MiMo-V2 MTP weights while loading the target model; MiMoV2MTP loads these weights in the draft model runner.
 [2026-05-05 19:27:28 TP2] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
 [2026-05-05 19:27:28 TP2] Load weight end. elapsed=36.06 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
 [2026-05-05 19:27:28 TP0] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
 [2026-05-05 19:27:28 TP0] Load weight end. elapsed=35.91 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
 [2026-05-05 19:27:28 TP1] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
 [2026-05-05 19:27:28 TP1] Load weight end. elapsed=36.14 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
 [2026-05-05 19:27:28 TP3] Using FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!
 [2026-05-05 19:27:28 TP3] Load weight end. elapsed=36.45 s, type=MiMoV2ForCausalLM, quant=modelopt_mixed, quant_algo=MIXED_PRECISION, avail mem=49.91 GB, mem usage=44.16 GB.
 [2026-05-05 19:27:28 TP0] Using KV cache dtype: torch.float8_e4m3fn
 [2026-05-05 19:27:28 TP0] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
 [2026-05-05 19:27:28 TP3] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
 [2026-05-05 19:27:28 TP2] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
 [2026-05-05 19:27:28 TP1] Use sliding window memory pool. full_layer_tokens=3706432, swa_layer_tokens=1111872
 [2026-05-05 19:27:28 TP3] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
 [2026-05-05 19:27:28 TP2] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
 [2026-05-05 19:27:28 TP0] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
 [2026-05-05 19:27:28 TP1] KV Cache is allocated. #tokens: 1111872, K size: 15.51 GB, V size: 10.34 GB
 [2026-05-05 19:27:28 TP2] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
 [2026-05-05 19:27:28 TP3] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
 [2026-05-05 19:27:28 TP2] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
 [2026-05-05 19:27:28 TP3] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
 [2026-05-05 19:27:28 TP2] Memory pool end. avail mem=13.97 GB
 [2026-05-05 19:27:28 TP0] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
 [2026-05-05 19:27:28 TP3] Memory pool end. avail mem=13.97 GB
 [2026-05-05 19:27:28 TP0] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
 [2026-05-05 19:27:28 TP1] KV Cache is allocated. #tokens: 3706432, K size: 5.96 GB, V size: 3.98 GB
 [2026-05-05 19:27:28 TP1] SWAKVPool mem usage: 35.79 GB, swa size: 1111872, full size: 3706432
 [2026-05-05 19:27:28 TP0] Memory pool end. avail mem=13.97 GB
 [2026-05-05 19:27:28 TP1] Memory pool end. avail mem=13.97 GB
 [2026-05-05 19:27:29 TP3] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
 [2026-05-05 19:27:29 TP1] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
 [2026-05-05 19:27:29 TP2] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
 [2026-05-05 19:27:29 TP0] Capture cuda graph begin. This can take up to several minutes. avail mem=13.07 GB
 [2026-05-05 19:27:29 TP0] Capture cuda graph bs [1, 2, 3, 4, 5, 6, 7, 8]
 
  0%|          | 0/8 [00:00<?, ?it/s]
Capturing batches (bs=8 avail_mem=13.02 GB):   0%|          | 0/8 [00:00<?, ?it/s]
Capturing batches (bs=8 avail_mem=13.02 GB):   0%|          | 0/8 [00:02<?, ?it/s]
 [2026-05-05 19:27:32 TP1] Scheduler hit an exception: Traceback (most recent call last):
   File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 3963, in run_scheduler_process
     scheduler = Scheduler(
                 ^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 433, in __init__
     self.init_model_worker()
   File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 703, in init_model_worker
     self.init_tp_model_worker()
   File "/opt/sglang/python/sglang/srt/managers/scheduler.py", line 658, in init_tp_model_worker
     self.tp_worker = TpModelWorker(**worker_kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/managers/tp_worker.py", line 260, in __init__
     self._init_model_runner()
   File "/opt/sglang/python/sglang/srt/managers/tp_worker.py", line 345, in _init_model_runner
     self._model_runner = ModelRunner(
                          ^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/model_executor/model_runner.py", line 511, in __init__
     self.initialize(pre_model_load_memory)
   File "/opt/sglang/python/sglang/srt/model_executor/model_runner.py", line 774, in initialize
     self.init_device_graphs()
   File "/opt/sglang/python/sglang/srt/model_executor/model_runner.py", line 2796, in init_device_graphs
     self.graph_runner = graph_runners[self.device](self)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 730, in __init__
     self.capture()
   File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 913, in capture
     _capture_one_stream()
   File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 901, in _capture_one_stream
     ) = self.capture_one_batch_size(bs, forward, stream_idx)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 1188, in capture_one_batch_size
     run_once()
   File "/opt/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 1166, in run_once
     logits_output_or_pp_proxy_tensors = forward(
                                         ^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
     return func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 1173, in forward
     hidden_states, hidden_states_before_norm = general_mm_embed_routine(
                                                ^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/managers/mm_utils.py", line 1106, in general_mm_embed_routine
     hidden_states = language_model(
                     ^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
     return self._call_impl(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
     return forward_call(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 1002, in forward
     hidden_states, residual = layer(
                               ^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
     return self._call_impl(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
     return forward_call(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 797, in forward
     hidden_states = self.self_attn(
                     ^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
     return self._call_impl(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
     return forward_call(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/models/mimo_v2.py", line 652, in forward
     qkv, _ = self.qkv_proj(hidden_states)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
     return self._call_impl(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
     return forward_call(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/layers/linear.py", line 466, in forward
     output_parallel = self.quant_method.apply(self, input_, bias)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/layers/quantization/fp8.py", line 751, in apply
     return self.w8a8_mxfp8_linear(
            ^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/venv/lib/python3.12/site-packages/torch/_ops.py", line 1269, in __call__
     return self._op(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/layers/quantization/fp8_utils.py", line 1016, in triton_mxfp8_blockscaled_linear
     return _raw_triton_mxfp8_blockscaled_linear(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/opt/sglang/python/sglang/srt/layers/quantization/fp8_utils.py", line 941, in _raw_triton_mxfp8_blockscaled_linear
     assert n % block_n == 0, f"{n=} must be divisible by {block_n}"
            ^^^^^^^^^^^^^^^^
 AssertionError: n=3392 must be divisible by 128
 
 [2026-05-05 19:27:32] Received sigquit from a child process. It usually means the child failed.

Workaround: launch with `--fp8-gemm-backend flashinfer_cutlass`. This routes FP8 GEMMs to the FlashInfer/CUTLASS backend instead of the Triton MXFP8 block-scaled kernel, which (per the assertion above) requires the output dimension to be divisible by 128 — n=3392 is not.

Sign up or log in to comment