@@ -1,10 +1,10 @@
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM rocm/vllm:latest
 
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
-ENV NVIDIA_VISIBLE_DEVICES=all
+ENV HIP_VISIBLE_DEVICES=all
 
 CMD ["python", "infer.py"]
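
Unlike the NVIDIA stack, ROCm containers do not go through a special container runtime: the GPU is exposed by passing the kernel device nodes into the container. A minimal sketch of building and running this image, assuming it is tagged rocm-infer (an illustrative name, not one defined by the repo):

    # Build the image from this Dockerfile; "rocm-infer" is an illustrative tag.
    docker build -t rocm-infer .

    # /dev/kfd is the ROCm compute interface and /dev/dri holds the render nodes;
    # --group-add video covers distros that restrict those nodes to the video group.
    docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video rocm-infer
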
@@ -1,15 +1,18 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda")
+device = _rocmport_device
 
 
 def main():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
     with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=64)
     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
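
Because the same torch.cuda calls now have two possible backends, it is worth confirming at runtime that the interpreter really has the ROCm build of PyTorch. One way to check, relying on torch.version.hip being None on CUDA builds and a version string on ROCm builds:

    # Prints the HIP version (or None on a CUDA build) and GPU availability.
    python3 -c "import torch; print(torch.version.hip, torch.cuda.is_available())"
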
@@ -2,5 +2,5 @@
 set -euo pipefail
 
-export CUDA_VISIBLE_DEVICES=0
-nvidia-smi
+export HIP_VISIBLE_DEVICES=0
+rocm-smi
 vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
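
On ROCm, vllm serve exposes the same OpenAI-compatible HTTP API as on CUDA, listening on port 8000 by default, so a quick smoke test against the running server might look like:

    # Ask the served model for a one-sentence completion via the
    # OpenAI-compatible chat completions endpoint (default port 8000).
    curl -s http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{
            "model": "Qwen/Qwen2.5-0.5B-Instruct",
            "messages": [{"role": "user", "content": "Explain ROCm in one sentence."}],
            "max_tokens": 64
          }'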