"""Run a nonuniform-pruned Qwen3-Coder-Next variant with vLLM.

Nonuniform variants have different expert counts per layer. Stock vLLM
assumes a single `config.num_experts` for all layers. This script applies
a monkey-patch before loading the model so each layer gets its own count
from `config.per_layer_num_experts`.
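
For example, a variant's config carries the per-layer counts alongside
the stock field, along these lines (values illustrative, not taken from
a real variant):

    "num_experts": 512,
    "per_layer_num_experts": [512, 384, 256, ...]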

Usage:
    # Interactive generation
    python run_vllm_nonuniform.py --model ./coding-25-nonuniform --tp 4

    # With a custom prompt
    python run_vllm_nonuniform.py --model ./coding-50-nonuniform --tp 2 \
        --prompt "Write a Python function to merge two sorted lists."

    # As a library (apply the patch before importing vLLM)
    import run_vllm_nonuniform
    run_vllm_nonuniform.setup("./coding-25-nonuniform")
    from vllm import LLM, SamplingParams
    llm = LLM(model="./coding-25-nonuniform", ...)

How it works:
    vLLM spawns worker subprocesses that re-import all modules, so a
    plain monkey-patch in the parent process would be lost. This script
    places the variant folder on PYTHONPATH so that `sitecustomize.py`
    (bundled in each nonuniform variant) auto-applies the patch in every
    Python process, including workers.
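
    The bundled sitecustomize.py is presumably a thin wrapper along these
    lines (hypothetical sketch; the actual file ships inside each variant
    folder):

        try:
            import vllm_pruned_patch
            vllm_pruned_patch.apply()
        except ImportError:
            pass  # patch module not on sys.path; leave process unpatched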
| """ |
|
|
import argparse
import json
import os
import sys
|
|
def setup(model_path: str):
    """Apply the nonuniform-expert patch here and in vLLM worker processes."""
    model_path = os.path.abspath(os.path.expanduser(model_path))

    # Report the variant type if pruning metadata is present.
    meta_path = os.path.join(model_path, "pruned_metadata.json")
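    # The metadata file presumably looks something like this (keys are the
    # ones read below; values are illustrative):
    #
    #     {
    #       "allocation": "nonuniform",
    #       "total_pruned": 128,
    #       "sparsity": 0.25,
    #       "per_layer_num_experts": [40, 40, 32, ...]
    #     }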
    if os.path.exists(meta_path):
        with open(meta_path) as f:
            meta = json.load(f)
        if meta.get("allocation") != "nonuniform":
            print(f"Note: {model_path} is a uniform variant; patch not needed.")
            return
        per = meta["per_layer_num_experts"]
        print(f"Nonuniform variant: {meta['total_pruned']} experts pruned "
              f"({meta['sparsity']:.0%}), kept range [{min(per)}, {max(per)}]")

    # Put the variant folder on PYTHONPATH so every spawned Python process
    # (including vLLM workers) runs its sitecustomize.py and gets patched.
    cur = os.environ.get("PYTHONPATH", "")
    if model_path not in cur:
        os.environ["PYTHONPATH"] = model_path + (os.pathsep + cur if cur else "")

    # Also make the patch importable in this already-running process,
    # which started before PYTHONPATH was modified.
    if model_path not in sys.path:
        sys.path.insert(0, model_path)

    # Apply the patch in this (parent) process; workers pick it up via
    # sitecustomize.py on PYTHONPATH.
    import vllm_pruned_patch
    vllm_pruned_patch.apply()
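
# Because the patch rides on sitecustomize.py, other vLLM entry points can
# presumably be patched straight from the shell by exporting PYTHONPATH
# yourself, e.g. (path illustrative):
#
#     PYTHONPATH=./coding-25-nonuniform vllm serve ./coding-25-nonuniform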
|
|
def main():
    parser = argparse.ArgumentParser(
        description="Run a nonuniform-pruned model with vLLM")
    parser.add_argument("--model", required=True,
                        help="Path to a nonuniform variant folder")
    parser.add_argument("--tp", type=int, default=4,
                        help="Tensor parallel size")
    parser.add_argument("--prompt", type=str,
                        default="def fibonacci(n):\n"
                                '    """Return the n-th Fibonacci number."""\n',
                        help="Prompt for generation")
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--gpu-mem-util", type=float, default=0.85)
    parser.add_argument("--max-model-len", type=int, default=4096)
    args = parser.parse_args()

    # Must run before vLLM is imported anywhere in this process.
    setup(args.model)

    # Import vLLM only after the patch is in place.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model=args.model,
        tensor_parallel_size=args.tp,
        dtype="bfloat16",
        gpu_memory_utilization=args.gpu_mem_util,
        max_model_len=args.max_model_len,
        trust_remote_code=True,
        enforce_eager=True,  # skip CUDA graph capture
    )

    sp = SamplingParams(
        temperature=args.temperature,
        max_tokens=args.max_tokens,
    )

    outputs = llm.generate([args.prompt], sp)
    text = outputs[0].outputs[0].text
    n_tok = len(outputs[0].outputs[0].token_ids)

    print("=" * 60)
    print("PROMPT:")
    print(args.prompt)
    print("=" * 60)
    print("COMPLETION:")
    print(text)
    print("=" * 60)
    print(f"({n_tok} tokens)")
|
|
if __name__ == "__main__":
    main()
|
|