zheyishine committed on
Commit
b6f4a2f
·
verified ·
1 Parent(s): c7e9500

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +8 -7
README.md CHANGED
@@ -1,8 +1,8 @@
1
- ---
2
- license: mit
3
- language:
4
- - en
5
- ---
6
  ## Ling-2.6-flash: Faster Responses, Stronger Execution, Higher Token Efficiency
7
  ### Introduction
8
  Today, we announce the official open-source release of **Ling-2.6-flash**, an **instruct model** with **104B total parameters** and **7.4B active parameters**.
@@ -95,7 +95,7 @@ python -m sglang.launch_server \
95
  --dp-size 1 \
96
  --trust-remote-code \
97
  --tool-call-parser qwen25 \
98
- --json-model-override-args '{"quantization_config": {"activation_scheme": "dynamic", "fmt": "e4m3", "quant_method": "fp8", "modules_to_not_convert": ["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj", "lm_head"], "weight_block_size": [128, 128]}, "linear_backend": "seg_la", "torch_dtype": "bfloat16", "architectures": ["BailingMoeV2_5ForCausalLM"], "model_type": "bailing_hybrid", "rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
99
  --dist-init-addr $MASTER_IP:2345 \
100
  --port $PORT \
101
  --nnodes 1
@@ -119,6 +119,7 @@ python -m sglang.launch_server \
119
  --tp-size 4 \
120
  --pp-size 1 \
121
  --dp-size 1 \
 
122
  --speculative-algorithm NEXTN \
123
  --speculative-num-steps 1 \
124
  --speculative-eagle-topk 1 \
@@ -127,7 +128,7 @@ python -m sglang.launch_server \
127
  --max-running-requests 64 \
128
  --max-mamba-cache-size 256 \
129
  --tool-call-parser qwen25 \
130
- --json-model-override-args '{"quantization_config": {"activation_scheme": "dynamic", "fmt": "e4m3", "quant_method": "fp8", "modules_to_not_convert": ["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj", "lm_head"], "weight_block_size": [128, 128]}, "linear_backend": "seg_la", "torch_dtype": "bfloat16", "architectures": ["BailingMoeV2_5ForCausalLM"], "model_type": "bailing_hybrid", "rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
131
  --trust-remote-code \
132
  --dist-init-addr $MASTER_IP:2345 \
133
  --port $PORT \
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ ---
6
  ## Ling-2.6-flash: Faster Responses, Stronger Execution, Higher Token Efficiency
7
  ### Introduction
8
  Today, we announce the official open-source release of **Ling-2.6-flash**, an **instruct model** with **104B total parameters** and **7.4B active parameters**.
 
95
  --dp-size 1 \
96
  --trust-remote-code \
97
  --tool-call-parser qwen25 \
98
+ --json-model-override-args '{"rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
99
  --dist-init-addr $MASTER_IP:2345 \
100
  --port $PORT \
101
  --nnodes 1
 
119
  --tp-size 4 \
120
  --pp-size 1 \
121
  --dp-size 1 \
122
+ --mamba-scheduler-strategy extra_buffer \
123
  --speculative-algorithm NEXTN \
124
  --speculative-num-steps 1 \
125
  --speculative-eagle-topk 1 \
 
128
  --max-running-requests 64 \
129
  --max-mamba-cache-size 256 \
130
  --tool-call-parser qwen25 \
131
+ --json-model-override-args '{"rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}' \
132
  --trust-remote-code \
133
  --dist-init-addr $MASTER_IP:2345 \
134
  --port $PORT \