|
|
|
|
| @@ -9,6 +9,9 @@
|
| import json
|
| import torch
|
| from transformers import AutoModelForCausalLM, AutoTokenizer
|
| +
|
| +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
| os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" # should → HIP_VISIBLE_DEVICES
|
|
|
| @@ -33,12 +36,12 @@
|
| hw = gpu_info()
|
| print("GPU info:", hw)
|
|
|
| - device = torch.device("cuda") # hardcoded CUDA device
|
| + device = _rocmport_device # portable device selected above
|
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda()
|
| + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was .cuda()
|
| model.eval()
|
|
|
| - inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda") # .to("cuda")
|
| + inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device) # was .to("cuda")
|
|
|
| # Warm-up
|
| with torch.no_grad():
|
|
|
|
|
| @@ -3,9 +3,11 @@
|
| inference:
|
| - image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| + image: rocm/pytorch:latest
|
| environment:
|
| - - NVIDIA_VISIBLE_DEVICES=all
|
| - - NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
| - - CUDA_VISIBLE_DEVICES=0
|
| + - HIP_VISIBLE_DEVICES=0
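|
| + # ROCm containers reach AMD GPUs through the kfd and dri device nodes
|
| + devices:
|
| + - /dev/kfd
|
| + - /dev/dri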
|
| deploy:
|
| resources:
|
| reservations:
|
| @@ -21,10 +21,12 @@
|
| vllm_server:
|
| - image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| + image: rocm/vllm:latest
|
| environment:
|
| - - NVIDIA_VISIBLE_DEVICES=0,1
|
| - - CUDA_VISIBLE_DEVICES=0,1
|
| + - HIP_VISIBLE_DEVICES=0,1
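|
| + devices:
|
| + - /dev/kfd
|
| + - /dev/dri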
|
| ports:
|
| - "8000:8000"
|
| command: >
|
| - bash -c "nvidia-smi && pip install vllm &&
|
| + bash -c "rocm-smi && pip install vllm &&
|
| vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
|
|
|
|
|
| @@ -1,10 +1,10 @@
|
| -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| +FROM rocm/pytorch:latest
|
|
|
| WORKDIR /app
|
| COPY requirements.txt .
|
| RUN pip install --no-cache-dir -r requirements.txt
|
| COPY . .
|
|
|
| -ENV NVIDIA_VISIBLE_DEVICES=all
|
| +# All mapped GPUs are visible by default; set HIP_VISIBLE_DEVICES (e.g. "0") to restrict
|
|
|
| CMD ["python", "infer.py"]
|
|
|
|
|
| @@ -1,15 +1,19 @@
|
| import torch
|
| from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
| +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
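|
| +# Note: torch.version.hip is a version string on ROCm builds and None on CUDA builds.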
|
| +
|
|
|
| MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| -device = torch.device("cuda")
|
| +device = _rocmport_device
|
|
|
|
|
| def main():
|
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
|
| - inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
|
| + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
|
| + inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
|
| with torch.no_grad():
|
| outputs = model.generate(**inputs, max_new_tokens=64)
|
| print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
|
|
|
|
| @@ -1,6 +1,7 @@
|
| #!/usr/bin/env bash
|
| set -euo pipefail
|
|
|
| -export CUDA_VISIBLE_DEVICES=0
|
| -nvidia-smi
|
| +export HIP_VISIBLE_DEVICES=0
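|
| +# HIP_VISIBLE_DEVICES takes comma-separated GPU indices, like CUDA_VISIBLE_DEVICES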
|
| +rocm-smi
|
| vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
|
|
|
|
|
| @@ -9,13 +9,16 @@
|
| from torch.utils.data import DataLoader, TensorDataset
|
| from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
| +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| +
|
| # ── CUDA-specific patterns that ROCmPort will flag ─────────────────────────
|
| os.environ["CUDA_VISIBLE_DEVICES"] = "0" # should → HIP_VISIBLE_DEVICES
|
| os.environ["CUDA_HOME"] = "/usr/local/cuda" # should be removed / replaced
|
|
|
| MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
|
|
| -device = torch.device("cuda") # hardcoded CUDA device
|
| + device = _rocmport_device # portable device selected above
|
| print("CUDA available:", torch.cuda.is_available())
|
|
|
|
|
| @@ -27,13 +30,13 @@
|
|
|
| def train(epochs: int = 3, lr: float = 2e-5):
|
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() call
|
| + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was .cuda()
|
|
|
| optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
|
|
|
| ids, labels = get_dummy_batch()
|
| - ids = ids.to("cuda") # hardcoded "cuda" string
|
| - labels = labels.to("cuda") # hardcoded "cuda" string
|
| + ids = ids.to(_rocmport_device) # was .to("cuda")
|
| + labels = labels.to(_rocmport_device) # was .to("cuda")
|
|
|
| dataset = TensorDataset(ids, labels)
|
| loader = DataLoader(dataset, batch_size=2)
|
| @@ -41,8 +44,8 @@
|
| model.train()
|
| for epoch in range(epochs):
|
| for batch_ids, batch_labels in loader:
|
| - batch_ids = batch_ids.cuda() # another .cuda() call
|
| - batch_labels = batch_labels.cuda()
|
| + batch_ids = batch_ids.to(_rocmport_device) # was .cuda()
|
| + batch_labels = batch_labels.to(_rocmport_device)
|
| outputs = model(input_ids=batch_ids, labels=batch_labels)
|
| loss = outputs.loss
|
| loss.backward()
|
|
|