--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -9,6 +9,9 @@
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" # should → HIP_VISIBLE_DEVICES
@@ -33,12 +36,12 @@
hw = gpu_info()
print("GPU info:", hw)
- device = torch.device("cuda") # hardcoded CUDA device
+ device = _rocmport_device # was hardcoded torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda()
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was .cuda()
model.eval()
- inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda") # .to("cuda")
+ inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device) # was .to("cuda")
# Warm-up
with torch.no_grad():
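
Reviewer note: a quick way to confirm which backend a given PyTorch build targets. On ROCm builds torch.version.hip is a version string and torch.version.cuda is None; on CUDA builds it is the reverse. A minimal sanity-check sketch, not part of the diff:

import torch

# torch.version.hip is set on ROCm builds; torch.version.cuda on CUDA builds.
if torch.version.hip is not None:
    print(f"ROCm/HIP build: {torch.version.hip}")
elif torch.version.cuda is not None:
    print(f"CUDA build: {torch.version.cuda}")
else:
    print("CPU-only build")

if torch.cuda.is_available():  # True for AMD GPUs on ROCm builds as well
    print("device 0:", torch.cuda.get_device_name(0))
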
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,9 +3,9 @@
inference:
- image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ image: rocm/pytorch:latest
environment:
- - NVIDIA_VISIBLE_DEVICES=all
- - NVIDIA_DRIVER_CAPABILITIES=compute,utility
- - CUDA_VISIBLE_DEVICES=0
+ - HIP_VISIBLE_DEVICES=0
+ # ROCm has no NVIDIA_DRIVER_CAPABILITIES equivalent; GPU access comes
+ # from the KFD/DRI device nodes instead.
+ devices:
+ - /dev/kfd
+ - /dev/dri
deploy:
resources:
reservations:
@@ -21,10 +21,10 @@
vllm_server:
- image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ image: rocm/vllm:latest
environment:
- - NVIDIA_VISIBLE_DEVICES=0,1
- - CUDA_VISIBLE_DEVICES=0,1
+ - HIP_VISIBLE_DEVICES=0,1
+ devices:
+ - /dev/kfd
+ - /dev/dri
ports:
- "8000:8000"
command: >
- bash -c "nvidia-smi && pip install vllm &&
+ bash -c "rocm-smi && pip install vllm &&
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
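
Reviewer note: the PyPI vllm wheel is a CUDA build, so the ported service relies on the ROCm image shipping vLLM instead of pip-installing it. HIP_VISIBLE_DEVICES filters device visibility the same way CUDA_VISIBLE_DEVICES does on NVIDIA stacks; a small check sketch, assuming a ROCm build of PyTorch, with the variable set before the GPU runtime initializes:

import os

# Must be set before torch touches the GPU runtime.
os.environ.setdefault("HIP_VISIBLE_DEVICES", "0")

import torch

print("visible GPUs:", torch.cuda.device_count())  # 1 if the mask took effect
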
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,10 @@
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM rocm/pytorch:latest
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
-ENV NVIDIA_VISIBLE_DEVICES=all
+# All GPUs are visible by default on ROCm; pass the device nodes at run time:
+#   docker run --device=/dev/kfd --device=/dev/dri ...
CMD ["python", "infer.py"]
--- a/infer.py
+++ b/infer.py
@@ -1,15 +1,18 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda")
+device = _rocmport_device
def main():
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
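
Reviewer note: the same three device-detection lines are now pasted into benchmark.py, infer.py, and train.py. A hypothetical shared helper (device_utils.py, an assumption, not generated by ROCmPort) would keep the port logic in one place:

import torch


def pick_device() -> torch.device:
    """Return the first visible GPU, or the CPU.

    ROCm builds of PyTorch expose AMD GPUs through the torch.cuda
    namespace, so the "cuda" device type covers both vendors.
    """
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

Each script would then do `from device_utils import pick_device` and `device = pick_device()`.
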
--- a/scripts/serve_vllm.sh
+++ b/scripts/serve_vllm.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
set -euo pipefail
-export CUDA_VISIBLE_DEVICES=0
-nvidia-smi
+export HIP_VISIBLE_DEVICES=0
+rocm-smi
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
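
Reviewer note: once the server is up, the OpenAI-compatible endpoint can be exercised from the host. A minimal request sketch using only the standard library (port 8000 and the /v1/completions route are vLLM defaults):

import json
import urllib.request

payload = {
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "prompt": "Explain ROCm in one sentence.",
    "max_tokens": 64,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])
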
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -9,13 +9,16 @@
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForCausalLM, AutoTokenizer
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
# ── CUDA-specific patterns flagged by ROCmPort, rewritten below ────────────
-os.environ["CUDA_VISIBLE_DEVICES"] = "0" # flagged: should → HIP_VISIBLE_DEVICES
-os.environ["CUDA_HOME"] = "/usr/local/cuda" # flagged: should be removed / replaced
+os.environ["HIP_VISIBLE_DEVICES"] = "0" # ROCm analogue of CUDA_VISIBLE_DEVICES
+os.environ["ROCM_PATH"] = "/opt/rocm" # ROCm install root replaces CUDA_HOME
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda") # hardcoded CUDA device
+ device = _rocmport_device # was hardcoded torch.device("cuda")
print("CUDA available:", torch.cuda.is_available())
@@ -27,13 +30,13 @@
def train(epochs: int = 3, lr: float = 2e-5):
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() call
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was a .cuda() call
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
ids, labels = get_dummy_batch()
- ids = ids.to("cuda") # hardcoded "cuda" string
- labels = labels.to("cuda") # hardcoded "cuda" string
+ ids = ids.to(_rocmport_device) # was hardcoded .to("cuda")
+ labels = labels.to(_rocmport_device) # was hardcoded .to("cuda")
dataset = TensorDataset(ids, labels)
loader = DataLoader(dataset, batch_size=2)
@@ -41,8 +44,8 @@
model.train()
for epoch in range(epochs):
for batch_ids, batch_labels in loader:
- batch_ids = batch_ids.cuda() # another .cuda() call
- batch_labels = batch_labels.cuda()
+ batch_ids = batch_ids.to(_rocmport_device) # was another .cuda() call
+ batch_labels = batch_labels.to(_rocmport_device)
outputs = model(input_ids=batch_ids, labels=batch_labels)
loss = outputs.loss
loss.backward()
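
Reviewer note: mixed precision carries over unchanged, since autocast and GradScaler with device_type="cuda" route through the same namespace on ROCm builds. A hedged sketch of what the training step could look like, not part of this diff:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")


def training_step(model, optimizer, batch_ids, batch_labels):
    optimizer.zero_grad()
    # device_type="cuda" also selects the ROCm backend on AMD builds.
    with torch.autocast(device_type="cuda", enabled=device.type == "cuda"):
        loss = model(input_ids=batch_ids, labels=batch_labels).loss
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return loss.item()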