--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -9,6 +9,9 @@
 import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # should → HIP_VISIBLE_DEVICES
 
@@ -33,12 +36,12 @@
     hw = gpu_info()
     print("GPU info:", hw)
 
-    device = torch.device("cuda")          # hardcoded CUDA device
+    device = _rocmport_device          # was a hardcoded CUDA device
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()   # .cuda()
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)   # was .cuda()
     model.eval()
 
-    inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda")       # .to("cuda")
+    inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device)       # was .to("cuda")
 
     # Warm-up
     with torch.no_grad():
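
Note: the _rocmport_device shim above relies on ROCm wheels of PyTorch
reporting torch.cuda.is_available() == True on AMD GPUs. To make the active
backend explicit, torch.version.hip can be checked; it is a version string
on ROCm builds and None on CUDA builds. A minimal sketch (ours, not part of
the benchmark):

    import torch

    # torch.version.hip is set on ROCm wheels; torch.version.cuda on CUDA wheels
    if getattr(torch.version, "hip", None):
        print(f"ROCm/HIP build {torch.version.hip}")
    elif torch.version.cuda:
        print(f"CUDA build {torch.version.cuda}")
    else:
        print("CPU-only build")
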
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,9 +3,9 @@
   inference:
     image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
     environment:
-      - NVIDIA_VISIBLE_DEVICES=all
-      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
-      - CUDA_VISIBLE_DEVICES=0
+      - HIP_VISIBLE_DEVICES=0
+      # NVIDIA_DRIVER_CAPABILITIES has no ROCm equivalent; the GPU is exposed
+      # through the /dev/kfd and /dev/dri device mappings instead
     deploy:
       resources:
         reservations:
@@ -21,10 +21,10 @@
   vllm_server:
     image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
     environment:
-      - NVIDIA_VISIBLE_DEVICES=0,1
-      - CUDA_VISIBLE_DEVICES=0,1
+      # NVIDIA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES collapse into one variable:
+      - HIP_VISIBLE_DEVICES=0,1
     ports:
       - "8000:8000"
     command: >
-      bash -c "nvidia-smi && pip install vllm &&
+      bash -c "rocm-smi && pip install vllm &&
                vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
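
Note: both services still run on the nvidia/cuda base image, so rocm-smi
will not exist inside the container until the image is also switched to a
ROCm one. Separately, ROCm containers get GPU access through the KFD and
DRI device nodes rather than the NVIDIA container runtime; a minimal sketch
of the extra compose keys (group name may vary by distro):

    services:
      inference:
        devices:
          - /dev/kfd
          - /dev/dri
        group_add:
          - video
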
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,10 @@
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+FROM vllm/vllm-openai-rocm:latest
 
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
-ENV NVIDIA_VISIBLE_DEVICES=all
+ENV HIP_VISIBLE_DEVICES=all
 
 CMD ["python", "infer.py"]
--- a/infer.py
+++ b/infer.py
@@ -1,15 +1,18 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-device = torch.device("cuda")
+device = _rocmport_device
 
 
 def main():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
     with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=64)
     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
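
Note: because ROCm reuses the "cuda" device type, model.to(_rocmport_device)
places the weights on the AMD GPU and generate() dispatches to HIP kernels
with no further changes. A quick smoke test for the ported script (the
reported name depends on the installed GPU):

    python -c "import torch; print(torch.cuda.get_device_name(0))"
    # prints the AMD GPU name, e.g. an "AMD Instinct ..." string, on ROCm
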
--- a/scripts/serve_vllm.sh
+++ b/scripts/serve_vllm.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-export CUDA_VISIBLE_DEVICES=0
-nvidia-smi
+export HIP_VISIBLE_DEVICES=0
+rocm-smi
 vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
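
Note: HIP_VISIBLE_DEVICES is the HIP-runtime analogue of
CUDA_VISIBLE_DEVICES; ROCm also honors ROCR_VISIBLE_DEVICES, which filters
devices at the lower ROCr level. Because the script sets -e, a guarded probe
avoids aborting on hosts where rocm-smi is missing (a defensive sketch, not
part of the port):

    command -v rocm-smi >/dev/null 2>&1 && rocm-smi || echo "rocm-smi not found"
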
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -9,13 +9,16 @@
 from torch.utils.data import DataLoader, TensorDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 # ── CUDA-specific patterns that ROCmPort will flag ─────────────────────────
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"          # should → HIP_VISIBLE_DEVICES
 os.environ["CUDA_HOME"] = "/usr/local/cuda"        # should be removed / replaced
 
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
 
-device = torch.device("cuda")                      # hardcoded CUDA device
+device = _rocmport_device                      # was a hardcoded CUDA device
 print("CUDA available:", torch.cuda.is_available())
 
 
@@ -27,13 +30,13 @@
 
 def train(epochs: int = 3, lr: float = 2e-5):
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()   # .cuda() call
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)   # was a .cuda() call
 
     optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
 
     ids, labels = get_dummy_batch()
-    ids = ids.to("cuda")        # hardcoded "cuda" string
-    labels = labels.to("cuda")  # hardcoded "cuda" string
+    ids = ids.to(_rocmport_device)        # was a hardcoded "cuda" string
+    labels = labels.to(_rocmport_device)  # was a hardcoded "cuda" string
 
     dataset = TensorDataset(ids, labels)
     loader = DataLoader(dataset, batch_size=2)
@@ -41,8 +44,8 @@
     model.train()
     for epoch in range(epochs):
         for batch_ids, batch_labels in loader:
-            batch_ids = batch_ids.cuda()    # another .cuda() call
-            batch_labels = batch_labels.cuda()
+            batch_ids = batch_ids.to(_rocmport_device)    # was another .cuda() call
+            batch_labels = batch_labels.to(_rocmport_device)
             outputs = model(input_ids=batch_ids, labels=batch_labels)
             loss = outputs.loss
             loss.backward()
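
Note: the per-batch .to(_rocmport_device) transfers use the same API on ROCm
as on CUDA, including pinned host memory and asynchronous copies. Where
host-to-device copies dominate, the usual refinement carries over unchanged
(a sketch, not part of the ported file):

    loader = DataLoader(dataset, batch_size=2, pin_memory=True)
    for batch_ids, batch_labels in loader:
        batch_ids = batch_ids.to(_rocmport_device, non_blocking=True)
        batch_labels = batch_labels.to(_rocmport_device, non_blocking=True)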