Spaces:
Runtime error
Runtime error
| --- a/benchmarks/benchmark.py | |
| +++ b/benchmarks/benchmark.py | |
| import json | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| +import os  # NOTE(review): os.environ is used below — confirm `os` isn't already imported above this hunk | |
| + | |
| -os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" # should → HIP_VISIBLE_DEVICES | |
| +# Device-visibility vars must be set BEFORE the first torch.cuda call. | |
| +os.environ["HIP_VISIBLE_DEVICES"] = "0,1" | |
| +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace. | |
| +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| hw = gpu_info() | |
| print("GPU info:", hw) | |
| - device = torch.device("cuda") # hardcoded CUDA device | |
| + device = _rocmport_device # hardcoded CUDA device | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) | |
| - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() | |
| + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # .to(_rocmport_device) | |
| model.eval() | |
| - inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda") # .to("cuda") | |
| + inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device) # .to(_rocmport_device) | |
| # Warm-up | |
| with torch.no_grad(): | |
| --- a/docker-compose.yml | |
| +++ b/docker-compose.yml | |
| inference: | |
| - image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 | |
| + image: rocm/pytorch:latest | |
| environment: | |
| - - NVIDIA_VISIBLE_DEVICES=all | |
| - - NVIDIA_DRIVER_CAPABILITIES=compute,utility | |
| - - CUDA_VISIBLE_DEVICES=0 | |
| + # NVIDIA_DRIVER_CAPABILITIES has no ROCm equivalent; HIP devices are listed once. | |
| + - HIP_VISIBLE_DEVICES=0 | |
| deploy: | |
| resources: | |
| reservations: | |
| vllm_server: | |
| - image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 | |
| + image: rocm/vllm:latest | |
| environment: | |
| - - NVIDIA_VISIBLE_DEVICES=0,1 | |
| - - CUDA_VISIBLE_DEVICES=0,1 | |
| + - HIP_VISIBLE_DEVICES=0,1 | |
| ports: | |
| - "8000:8000" | |
| command: > | |
| - bash -c "nvidia-smi && pip install vllm && | |
| + # NOTE: `pip install vllm` fetches the CUDA wheel; the rocm/vllm image ships a ROCm build. | |
| + bash -c "rocm-smi && | |
| vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2" | |
| --- a/Dockerfile | |
| +++ b/Dockerfile | |
| -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 | |
| +# rocm/pytorch provides torch on ROCm; this container only runs infer.py (no vllm needed). | |
| +FROM rocm/pytorch:latest | |
| WORKDIR /app | |
| COPY requirements.txt . | |
| RUN pip install --no-cache-dir -r requirements.txt | |
| COPY . . | |
| -ENV NVIDIA_VISIBLE_DEVICES=all | |
| +ENV HIP_VISIBLE_DEVICES=all | |
| CMD ["python", "infer.py"] | |
| --- a/infer.py | |
| +++ b/infer.py | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace. | |
| +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| + | |
| MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" | |
| -device = torch.device("cuda") | |
| +device = _rocmport_device | |
| def main(): | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) | |
| - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() | |
| - inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda") | |
| + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) | |
| + inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device) | |
| with torch.no_grad(): | |
| outputs = model.generate(**inputs, max_new_tokens=64) | |
| print(tokenizer.decode(outputs[0], skip_special_tokens=True)) | |
| --- a/scripts/serve_vllm.sh | |
| +++ b/scripts/serve_vllm.sh | |
| #!/usr/bin/env bash | |
| set -euo pipefail | |
| -export CUDA_VISIBLE_DEVICES=0 | |
| -nvidia-smi | |
| +export HIP_VISIBLE_DEVICES=0 | |
| +rocm-smi | |
| vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1 | |
| --- a/scripts/train.py | |
| +++ b/scripts/train.py | |
| from torch.utils.data import DataLoader, TensorDataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| # ── CUDA-specific patterns that ROCmPort will flag ───────────────────────── | |
| -os.environ["CUDA_VISIBLE_DEVICES"] = "0" # should → HIP_VISIBLE_DEVICES | |
| +# Device-visibility vars must be set BEFORE the first torch.cuda call. | |
| +os.environ["HIP_VISIBLE_DEVICES"] = "0" | |
| -os.environ["CUDA_HOME"] = "/usr/local/cuda" # should be removed / replaced | |
| +os.environ["ROCM_PATH"] = "/opt/rocm"  # ROCm install root; replaces CUDA_HOME | |
| +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace. | |
| +# NOTE(review): confirm `os` and `torch` are imported above this hunk. | |
| +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| + | |
| MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" | |
| -device = torch.device("cuda") # hardcoded CUDA device | |
| +device = _rocmport_device # hardcoded CUDA device | |
| print("CUDA available:", torch.cuda.is_available()) | |
| def train(epochs: int = 3, lr: float = 2e-5): | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) | |
| - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() call | |
| + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # .to(_rocmport_device) call | |
| optimizer = torch.optim.AdamW(model.parameters(), lr=lr) | |
| ids, labels = get_dummy_batch() | |
| - ids = ids.to("cuda") # hardcoded "cuda" string | |
| - labels = labels.to("cuda") # hardcoded "cuda" string | |
| + ids = ids.to(_rocmport_device) # hardcoded "cuda" string | |
| + labels = labels.to(_rocmport_device) # hardcoded "cuda" string | |
| dataset = TensorDataset(ids, labels) | |
| loader = DataLoader(dataset, batch_size=2) | |
| model.train() | |
| for epoch in range(epochs): | |
| for batch_ids, batch_labels in loader: | |
| - batch_ids = batch_ids.cuda() # another .cuda() call | |
| - batch_labels = batch_labels.cuda() | |
| + batch_ids = batch_ids.to(_rocmport_device) # another .to(_rocmport_device) call | |
| + batch_labels = batch_labels.to(_rocmport_device) | |
| outputs = model(input_ids=batch_ids, labels=batch_labels) | |
| loss = outputs.loss | |
| loss.backward() | |