zheyishine committed on
Commit
b6f4a2f
·
verified ·
1 Parent(s): c7e9500

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +8 -7
README.md CHANGED
@@ -1,8 +1,8 @@
1
- ---
2
- license: mit
3
- language:
4
- - en
5
- ---
6
  ## Ling-2.6-flash: Faster Responses, Stronger Execution, Higher Token Efficiency
7
  ### Introduction
8
  Today, we announce the official open-source release of **Ling-2.6-flash**, an **instruct model** with **104B total parameters** and **7.4B active parameters**.
@@ -95,7 +95,7 @@ python -m sglang.launch_server \
95
  --dp-size 1 \
96
  --trust-remote-code \
97
  --tool-call-parser qwen25 \
98
- --json-model-override-args '{"quantization_config": {"activation_scheme": "dynamic", "fmt": "e4m3", "quant_method": "fp8", "modules_to_not_convert": ["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj", "lm_head"], "weight_block_size": [128, 128]}, "linear_backend": "seg_la", "torch_dtype": "bfloat16", "architectures": ["BailingMoeV2_5ForCausalLM"], "model_type": "bailing_hybrid", "rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
99
  --dist-init-addr $MASTER_IP:2345 \
100
  --port $PORT \
101
  --nnodes 1
@@ -119,6 +119,7 @@ python -m sglang.launch_server \
119
  --tp-size 4 \
120
  --pp-size 1 \
121
  --dp-size 1 \
 
122
  --speculative-algorithm NEXTN \
123
  --speculative-num-steps 1 \
124
  --speculative-eagle-topk 1 \
@@ -127,7 +128,7 @@ python -m sglang.launch_server \
127
  --max-running-requests 64 \
128
  --max-mamba-cache-size 256 \
129
  --tool-call-parser qwen25 \
130
- --json-model-override-args '{"quantization_config": {"activation_scheme": "dynamic", "fmt": "e4m3", "quant_method": "fp8", "modules_to_not_convert": ["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj", "lm_head"], "weight_block_size": [128, 128]}, "linear_backend": "seg_la", "torch_dtype": "bfloat16", "architectures": ["BailingMoeV2_5ForCausalLM"], "model_type": "bailing_hybrid", "rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
131
  --trust-remote-code \
132
  --dist-init-addr $MASTER_IP:2345 \
133
  --port $PORT \
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ ---
6
  ## Ling-2.6-flash: Faster Responses, Stronger Execution, Higher Token Efficiency
7
  ### Introduction
8
  Today, we announce the official open-source release of **Ling-2.6-flash**, an **instruct model** with **104B total parameters** and **7.4B active parameters**.
 
95
  --dp-size 1 \
96
  --trust-remote-code \
97
  --tool-call-parser qwen25 \
98
+ --json-model-override-args '{"rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
99
  --dist-init-addr $MASTER_IP:2345 \
100
  --port $PORT \
101
  --nnodes 1
 
119
  --tp-size 4 \
120
  --pp-size 1 \
121
  --dp-size 1 \
122
+ --mamba-scheduler-strategy extra_buffer \
123
  --speculative-algorithm NEXTN \
124
  --speculative-num-steps 1 \
125
  --speculative-eagle-topk 1 \
 
128
  --max-running-requests 64 \
129
  --max-mamba-cache-size 256 \
130
  --tool-call-parser qwen25 \
131
+ --json-model-override-args '{"rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
132
  --trust-remote-code \
133
  --dist-init-addr $MASTER_IP:2345 \
134
  --port $PORT \