ROCm port of the benchmark, inference and training scripts plus the container files.

Review notes (text before the first file header is ignored by patch tools):
  * The device probe (_rocmport_device) is inserted AFTER the os.environ
    visibility assignments. torch.cuda.is_available() initialises the GPU
    runtime, so a visibility mask set after the probe would not take effect.
  * HIP_VISIBLE_DEVICES appears at most once per service and only carries
    device indices. "all" is not a valid value, and the NVIDIA container-runtime
    variables are dropped rather than renamed.
  * Trailing comments that still described the removed CUDA calls were
    removed from the rewritten lines.
  * Line endings and indentation of context lines were reconstructed from a
    whitespace-collapsed copy of this patch - verify against the target files
    before applying.

--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -9,6 +9,10 @@
 import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # should → HIP_VISIBLE_DEVICES
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+# Probe only after the visibility mask above is set; the probe initialises the runtime.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -33,12 +37,12 @@
     hw = gpu_info()
     print("GPU info:", hw)
 
-    device = torch.device("cuda")  # hardcoded CUDA device
+    device = _rocmport_device  # GPU when one is visible, otherwise CPU
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()  # .cuda()
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
     model.eval()
 
-    inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda")  # .to("cuda")
+    inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device)
 
     # Warm-up
     with torch.no_grad():
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,9 +3,9 @@
   inference:
     image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
     environment:
-      - NVIDIA_VISIBLE_DEVICES=all
-      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
-      - CUDA_VISIBLE_DEVICES=0
+      # NVIDIA_VISIBLE_DEVICES / NVIDIA_DRIVER_CAPABILITIES are NVIDIA container-runtime settings: dropped, not renamed.
+      # NOTE(review): the image above is still nvidia/cuda - switch to a ROCm base image and confirm the GPU device mappings.
+      - HIP_VISIBLE_DEVICES=0
     deploy:
       resources:
         reservations:
@@ -21,10 +21,10 @@
   vllm_server:
     image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
     environment:
-      - NVIDIA_VISIBLE_DEVICES=0,1
-      - CUDA_VISIBLE_DEVICES=0,1
+      # NOTE(review): rocm-smi (used in the command below) is presumably absent from the nvidia/cuda image above - confirm and switch the image.
+      - HIP_VISIBLE_DEVICES=0,1
     ports:
       - "8000:8000"
     command: >
-      bash -c "nvidia-smi && pip install vllm &&
+      bash -c "rocm-smi && pip install vllm &&
       vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,10 @@
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM vllm/vllm-openai-rocm:latest
 
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
-ENV NVIDIA_VISIBLE_DEVICES=all
+# HIP_VISIBLE_DEVICES is intentionally left unset: it expects a list of device IDs, and "all" is not a valid value.
 
 CMD ["python", "infer.py"]
--- a/infer.py
+++ b/infer.py
@@ -1,15 +1,18 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
 
-device = torch.device("cuda")
+device = _rocmport_device
 
 
 def main():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
     with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=64)
     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--- a/scripts/serve_vllm.sh
+++ b/scripts/serve_vllm.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-export CUDA_VISIBLE_DEVICES=0
-nvidia-smi
+export HIP_VISIBLE_DEVICES=0
+rocm-smi
 vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -9,13 +9,17 @@
 from torch.utils.data import DataLoader, TensorDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # ── CUDA-specific patterns that ROCmPort will flag ─────────────────────────
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # should → HIP_VISIBLE_DEVICES
 os.environ["CUDA_HOME"] = "/usr/local/cuda"  # should be removed / replaced
 
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+# Probe only after the visibility mask above is set; the probe initialises the runtime.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda")  # hardcoded CUDA device
+device = _rocmport_device  # GPU when one is visible, otherwise CPU
 
 print("CUDA available:", torch.cuda.is_available())
 
 
@@ -27,13 +31,13 @@
 
 def train(epochs: int = 3, lr: float = 2e-5):
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()  # .cuda() call
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
     optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
 
     ids, labels = get_dummy_batch()
-    ids = ids.to("cuda")  # hardcoded "cuda" string
-    labels = labels.to("cuda")  # hardcoded "cuda" string
+    ids = ids.to(_rocmport_device)
+    labels = labels.to(_rocmport_device)
 
     dataset = TensorDataset(ids, labels)
     loader = DataLoader(dataset, batch_size=2)
 
@@ -41,8 +45,8 @@
     model.train()
     for epoch in range(epochs):
         for batch_ids, batch_labels in loader:
-            batch_ids = batch_ids.cuda()  # another .cuda() call
-            batch_labels = batch_labels.cuda()
+            batch_ids = batch_ids.to(_rocmport_device)
+            batch_labels = batch_labels.to(_rocmport_device)
             outputs = model(input_ids=batch_ids, labels=batch_labels)
             loss = outputs.loss
             loss.backward()