--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -9,6 +9,10 @@
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
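+# (On a ROCm wheel, torch.version.hip is a version string and torch.version.cuda is None.)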
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" # should → HIP_VISIBLE_DEVICES
@@ -33,12 +37,12 @@
hw = gpu_info()
print("GPU info:", hw)
- device = torch.device("cuda") # hardcoded CUDA device
+ device = _rocmport_device # was: hardcoded CUDA device
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda()
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was: .cuda()
model.eval()
- inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda") # .to("cuda")
+ inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device) # was: .to("cuda")
# Warm-up
with torch.no_grad():
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,9 +3,9 @@
inference:
- image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ image: rocm/pytorch:latest # ROCm base; the CUDA image ships no HIP runtime
environment:
- - NVIDIA_VISIBLE_DEVICES=all
- - NVIDIA_DRIVER_CAPABILITIES=compute,utility
- - CUDA_VISIBLE_DEVICES=0
+ - HIP_VISIBLE_DEVICES=0
+ # NVIDIA_DRIVER_CAPABILITIES has no ROCm equivalent; GPU access comes from
+ # mounting /dev/kfd and /dev/dri into the container instead.
deploy:
resources:
reservations:
@@ -21,10 +21,10 @@
vllm_server:
- image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ image: rocm/vllm:latest # bundles a ROCm build of vLLM; 'pip install vllm' would fetch the CUDA wheel
environment:
- - NVIDIA_VISIBLE_DEVICES=0,1
- - CUDA_VISIBLE_DEVICES=0,1
+ - HIP_VISIBLE_DEVICES=0,1
+ # (NVIDIA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES both map to HIP_VISIBLE_DEVICES)
ports:
- "8000:8000"
command: >
- bash -c "nvidia-smi && pip install vllm &&
+ bash -c "rocm-smi && pip install vllm &&
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,12 @@
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM rocm/pytorch:latest
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
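+# NOTE: if requirements.txt pins torch, pip may replace the base image's ROCm
+# torch with a CUDA wheel; install torch from the ROCm wheel index instead.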
COPY . .
-ENV NVIDIA_VISIBLE_DEVICES=all
+# HIP_VISIBLE_DEVICES takes device indices (e.g. "0,1"), not "all"; all GPUs are visible by default.
CMD ["python", "infer.py"]
--- a/infer.py
+++ b/infer.py
@@ -1,15 +1,19 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
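+# (Falls back to CPU when no GPU is visible, so the script still runs on CPU-only hosts.)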
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda")
+device = _rocmport_device
def main():
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--- a/scripts/serve_vllm.sh
+++ b/scripts/serve_vllm.sh
@@ -1,6 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
-export CUDA_VISIBLE_DEVICES=0
-nvidia-smi
+export HIP_VISIBLE_DEVICES=0
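+# rocm-smi is ROCm's counterpart to nvidia-smi for a quick device sanity check.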
+rocm-smi
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -9,13 +9,17 @@
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForCausalLM, AutoTokenizer
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
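+# (HIP_VISIBLE_DEVICES restricts the devices torch.cuda sees, mirroring CUDA_VISIBLE_DEVICES.)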
+
-# ── CUDA-specific patterns that ROCmPort will flag ─────────────────────────
+# ── CUDA-specific patterns migrated by ROCmPort ────────────────────────────
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # should → HIP_VISIBLE_DEVICES
os.environ["CUDA_HOME"] = "/usr/local/cuda" # should be removed / replaced
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda") # hardcoded CUDA device
+ device = _rocmport_device # was: hardcoded CUDA device
print("CUDA available:", torch.cuda.is_available())
@@ -27,13 +31,13 @@
def train(epochs: int = 3, lr: float = 2e-5):
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() call
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was: .cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
ids, labels = get_dummy_batch()
- ids = ids.to("cuda") # hardcoded "cuda" string
- labels = labels.to("cuda") # hardcoded "cuda" string
+ ids = ids.to(_rocmport_device) # was: hardcoded "cuda" string
+ labels = labels.to(_rocmport_device) # was: hardcoded "cuda" string
dataset = TensorDataset(ids, labels)
loader = DataLoader(dataset, batch_size=2)
@@ -41,8 +45,8 @@
model.train()
for epoch in range(epochs):
for batch_ids, batch_labels in loader:
- batch_ids = batch_ids.cuda() # another .cuda() call
- batch_labels = batch_labels.cuda()
+ batch_ids = batch_ids.to(_rocmport_device) # was: .cuda()
+ batch_labels = batch_labels.to(_rocmport_device)
outputs = model(input_ids=batch_ids, labels=batch_labels)
loss = outputs.loss
loss.backward()