#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server for Qwen2.5-14B-Instruct.
# NOTE(review): model path is machine-specific — adjust per deployment.
# (Original file was wrapped in markdown-table pipes, which is not valid
#  shell; the pipes have been stripped so the script actually runs.)

CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
  --model /home/hector5/models/Qwen2.5-14B-Instruct/ \
  --served-model-name Qwen2.5-14B-Instruct \
  --dtype auto \
  --max-model-len 10000 \
  --tensor-parallel-size 1 \
  --port 8000

# Alternative served name (append to the command above):
#   --served-model-name Llama-2-7b-chat
# Bind to all interfaces instead of localhost:
#   --host 0.0.0.0
# Equivalent invocation via the `vllm serve` CLI (different GPU/model):
# CUDA_VISIBLE_DEVICES=3 vllm serve /data1/public/models/Qwen2.5-7B-Instruct \
#   --port 8000 --served-model-name Qwen2.5-7B-Instruct --dtype auto \
#   --max-model-len 10000 --tensor-parallel-size 1 --enable_prefix_caching