--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,10 @@
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM vllm/vllm-openai-rocm:latest
 
 WORKDIR /app
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
-ENV NVIDIA_VISIBLE_DEVICES=all
+ENV HIP_VISIBLE_DEVICES=all
 CMD ["python", "infer.py"]
--- a/infer.py
+++ b/infer.py
@@ -1,15 +1,18 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda")
+device = _rocmport_device
 
 
 def main():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
     with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=64)
 
     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--- a/scripts/serve_vllm.sh
+++ b/scripts/serve_vllm.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-export CUDA_VISIBLE_DEVICES=0
-nvidia-smi
+export HIP_VISIBLE_DEVICES=0
+rocm-smi
 vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
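
For anyone trying the change locally, a minimal smoke test on a ROCm host might look like the sketch below; it is not part of the patch. The image name rocm-infer is a placeholder, the /dev/kfd and /dev/dri passthrough plus --group-add video follow AMD's usual guidance for ROCm containers, and the curl call assumes vLLM's default OpenAI-compatible server on port 8000.

# Build the image defined above (tag is a placeholder).
docker build -t rocm-infer .

# /dev/kfd and /dev/dri expose the AMD GPU to the container; --group-add video
# and --ipc=host are the commonly recommended settings for ROCm + PyTorch workloads.
docker run --rm \
  --device=/dev/kfd --device=/dev/dri \
  --group-add video --ipc=host \
  rocm-infer

# scripts/serve_vllm.sh instead starts an OpenAI-compatible server (port 8000 by default);
# a plain completion request is enough to confirm it is answering:
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen2.5-0.5B-Instruct", "prompt": "Explain ROCm in one sentence.", "max_tokens": 64}'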