#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server serving Qwen2.5-14B-Instruct
# on GPU 0, listening on port 8000.
#
# NOTE(review): the original file had its line continuations collapsed onto a
# single line, turning each `\` + newline into an escaped space and breaking
# the command; the multi-line form is restored here.
set -euo pipefail

CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
  --model /home/hector5/models/Qwen2.5-14B-Instruct/ \
  --served-model-name Qwen2.5-14B-Instruct \
  --dtype auto \
  --max-model-len 10000 \
  --tensor-parallel-size 1 \
  --port 8000

# Previously used alternatives, kept for reference:
# --served-model-name Llama-2-7b-chat
# --host 0.0.0.0 \
# CUDA_VISIBLE_DEVICES=3 vllm serve /data1/public/models/Qwen2.5-7B-Instruct --port 8000 --served-model-name Qwen2.5-7B-Instruct --dtype auto --max-model-len 10000 --tensor-parallel-size 1 --enable_prefix_caching