Deploy ROCmPort AI — CUDA-to-ROCm migration scanner
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- .gitignore +12 -0
- .venv/Scripts/python.exe +3 -0
- .venv/Scripts/pythonw.exe +3 -0
- .venv/pyvenv.cfg +5 -0
- LICENSE +21 -0
- README.md +110 -6
- app.py +370 -0
- artifacts/check-scoring/Dockerfile.rocm +11 -0
- artifacts/check-scoring/ROCM_FEEDBACK.md +17 -0
- artifacts/check-scoring/ROCM_MIGRATION_COOKBOOK.md +40 -0
- artifacts/check-scoring/amd_developer_cloud_runbook.md +50 -0
- artifacts/check-scoring/benchmark_result.json +20 -0
- artifacts/check-scoring/migration_report.md +47 -0
- artifacts/check-scoring/rocm_patch.diff +50 -0
- artifacts/check-scoring/rocmport_artifacts.zip +3 -0
- artifacts/check/Dockerfile.rocm +11 -0
- artifacts/check/ROCM_FEEDBACK.md +17 -0
- artifacts/check/ROCM_MIGRATION_COOKBOOK.md +40 -0
- artifacts/check/amd_developer_cloud_runbook.md +50 -0
- artifacts/check/benchmark_result.json +20 -0
- artifacts/check/migration_report.md +45 -0
- artifacts/check/rocm_patch.diff +48 -0
- artifacts/check/rocmport_artifacts.zip +3 -0
- artifacts/check2/Dockerfile.rocm +11 -0
- artifacts/check2/ROCM_FEEDBACK.md +17 -0
- artifacts/check2/ROCM_MIGRATION_COOKBOOK.md +40 -0
- artifacts/check2/amd_developer_cloud_runbook.md +50 -0
- artifacts/check2/benchmark_result.json +20 -0
- artifacts/check2/migration_report.md +46 -0
- artifacts/check2/rocm_patch.diff +50 -0
- artifacts/check2/rocmport_artifacts.zip +3 -0
- artifacts/hackathon_content.txt +0 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/Dockerfile.rocm +11 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_FEEDBACK.md +17 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_MIGRATION_COOKBOOK.md +40 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/amd_developer_cloud_runbook.md +50 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/benchmark_result.json +20 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/migration_report.md +71 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocm_patch.diff +155 -0
- artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocmport_artifacts.zip +3 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/Dockerfile.rocm +11 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_FEEDBACK.md +17 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_MIGRATION_COOKBOOK.md +40 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/amd_developer_cloud_runbook.md +50 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/benchmark_result.json +20 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/migration_report.md +45 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocm_patch.diff +48 -0
- artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocmport_artifacts.zip +3 -0
- artifacts/runtime/rocmport-ui-artifacts-6f4540ff23e142ad9d6ab18154ea44e6/Dockerfile.rocm +11 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
.venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
.venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
.pytest_cache/
|
| 3 |
+
.venv/
|
| 4 |
+
venv/
|
| 5 |
+
dist/
|
| 6 |
+
build/
|
| 7 |
+
*.egg-info/
|
| 8 |
+
.gradio/
|
| 9 |
+
artifacts/
|
| 10 |
+
tmp/
|
| 11 |
+
.tmp/
|
| 12 |
+
pytest-cache-files-*/
|
.venv/Scripts/python.exe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8afe555632efdbf8b01309532efc9389c5d4417fac976f6ee4389c7750772745
|
| 3 |
+
size 269072
|
.venv/Scripts/pythonw.exe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:080e0247dd5a0240070bb2244b2bd64fc31217c764c8c9d7ce3bf844760ba88e
|
| 3 |
+
size 256784
|
.venv/pyvenv.cfg
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
home = C:\Users\nawan\anaconda3
|
| 2 |
+
include-system-site-packages = false
|
| 3 |
+
version = 3.12.4
|
| 4 |
+
executable = C:\Users\nawan\anaconda3\python.exe
|
| 5 |
+
command = C:\Users\nawan\anaconda3\python.exe -m venv C:\Users\nawan\Documents\Codex\2026-05-05\come-build-the-next-generation-of\.venv
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 ROCmPort AI contributors
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,11 +1,115 @@
|
|
| 1 |
---
|
| 2 |
title: ROCmPort AI
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
|
|
|
| 7 |
pinned: false
|
| 8 |
-
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: ROCmPort AI
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: app.py
|
| 8 |
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
short_description: CUDA-to-ROCm migration scanner for PyTorch, HF & vLLM repos
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# ⚡ ROCmPort AI
|
| 14 |
+
|
| 15 |
+
> **AMD Developer Hackathon — lablab.ai** | Track: AI Agents & Agentic Workflows
|
| 16 |
+
|
| 17 |
+
ROCmPort AI is a **CUDA-to-ROCm migration scanner** powered by a three-agent CrewAI pipeline and Qwen3-Coder running on AMD Instinct GPUs. Drop in any CUDA-first PyTorch, Hugging Face, or vLLM repository and get a full AMD readiness report in seconds.
|
| 18 |
+
|
| 19 |
+
## What it does
|
| 20 |
+
|
| 21 |
+
```mermaid
|
| 22 |
+
graph LR
|
| 23 |
+
User([User Repo]) --> Gradio[Gradio UI]
|
| 24 |
+
Gradio --> Pipeline{Pipeline}
|
| 25 |
+
|
| 26 |
+
subgraph Agentic Workflow
|
| 27 |
+
Pipeline --> Auditor[CUDA Auditor]
|
| 28 |
+
Auditor --> Engineer[ROCm Engineer]
|
| 29 |
+
Engineer --> Reporter[Report Writer]
|
| 30 |
+
end
|
| 31 |
+
|
| 32 |
+
Reporter --> LLM[(Qwen3-Coder on AMD Instinct)]
|
| 33 |
+
LLM --> Reporter
|
| 34 |
+
|
| 35 |
+
Pipeline --> Scanner[Deterministic Scanner]
|
| 36 |
+
Scanner --> Patcher[Patcher]
|
| 37 |
+
Patcher --> Artifacts[Artifact Generator]
|
| 38 |
+
|
| 39 |
+
Reporter --> Final([Migration Artifacts & Patch])
|
| 40 |
+
Artifacts --> Final
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
| Output | Description |
|
| 44 |
+
|---|---|
|
| 45 |
+
| **AMD Readiness Score** | Before/after scores across 5 categories |
|
| 46 |
+
| **Findings table** | File + line references for every CUDA blocker |
|
| 47 |
+
| **ROCm patch diff** | Auto-generated unified diff to apply deterministic fixes |
|
| 48 |
+
| **Dockerfile.rocm** | ROCm-enabled container using vllm/vllm-openai-rocm |
|
| 49 |
+
| **AMD Developer Cloud Runbook** | Exact validation commands for AMD Instinct GPUs |
|
| 50 |
+
| **Migration report** | Narrative report (CrewAI + Qwen when configured) |
|
| 51 |
+
| **Benchmark schema** | Structured result to fill after AMD Developer Cloud run |
|
| 52 |
+
| **Artifact ZIP** | All outputs bundled for download |
|
| 53 |
+
|
| 54 |
+
## Three-agent pipeline
|
| 55 |
+
|
| 56 |
+
When `QWEN_BASE_URL` and `QWEN_API_KEY` are set (pointing to a Qwen3-Coder endpoint on AMD Instinct MI300X via vLLM), three CrewAI agents collaborate:
|
| 57 |
+
|
| 58 |
+
1. **CUDA Migration Auditor** — scans every file for blockers using `scan_cuda_repository` tool
|
| 59 |
+
2. **ROCm Migration Engineer** — generates the patch diff using `generate_rocm_patch` tool
|
| 60 |
+
3. **Migration Report Writer** — synthesises findings into an actionable Markdown report
|
| 61 |
+
|
| 62 |
+
Without those env vars the app falls back to the fully deterministic scanner + patcher (which always runs).
|
| 63 |
+
|
| 64 |
+
## Run locally
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
pip install -r requirements.txt
|
| 68 |
+
python app.py
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
App listens on `http://127.0.0.1:7860`.
|
| 72 |
+
|
| 73 |
+
## Enable the full CrewAI + Qwen pipeline
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
# Windows
|
| 77 |
+
set QWEN_BASE_URL=https://your-amd-instinct-endpoint/v1
|
| 78 |
+
set QWEN_API_KEY=your-token
|
| 79 |
+
set QWEN_MODEL=Qwen/Qwen3-Coder-Next-FP8
|
| 80 |
+
python app.py
|
| 81 |
+
|
| 82 |
+
# Linux / macOS
|
| 83 |
+
QWEN_BASE_URL=https://your-amd-instinct-endpoint/v1 \
|
| 84 |
+
QWEN_API_KEY=your-token \
|
| 85 |
+
QWEN_MODEL=Qwen/Qwen3-Coder-Next-FP8 \
|
| 86 |
+
python app.py
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## Tests
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
python -m pytest tests/ -v
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
7 tests cover the scanner, pipeline, and CrewAI agent layer.
|
| 96 |
+
|
| 97 |
+
## AMD Benchmark
|
| 98 |
+
|
| 99 |
+
The `data/benchmark_result.json` is a transparent **pending benchmark schema** — not a fabricated result. Run the generated AMD Developer Cloud runbook (shown in the app's Runbook tab) on an AMD Instinct MI300X instance to capture real throughput, latency, and VRAM figures, then replace the file.
|
| 100 |
+
|
| 101 |
+
## Deploy to Hugging Face Spaces
|
| 102 |
+
|
| 103 |
+
```bash
|
| 104 |
+
python scripts/deploy_to_hf.py --token hf_... --username YourHFUsername
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Tech stack
|
| 108 |
+
|
| 109 |
+
- **AMD Developer Cloud** + **AMD Instinct MI300X** for GPU compute
|
| 110 |
+
- **ROCm** — open-source GPU computing platform
|
| 111 |
+
- **CrewAI** — multi-agent orchestration
|
| 112 |
+
- **Qwen3-Coder-Next-FP8** — code-specialist LLM on AMD hardware
|
| 113 |
+
- **vLLM (ROCm build)** — high-throughput serving
|
| 114 |
+
- **Hugging Face** — model hub + Space hosting
|
| 115 |
+
- **Gradio 5** — web UI
|
app.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

import html
import json
import os
import uuid
from pathlib import Path
from typing import Any

import gradio as gr

from rocmport.agents import CREWAI_AVAILABLE
from rocmport.ingest import PreparedRepo, prepare_github_repo, prepare_uploaded_zip, sample_repo_path
from rocmport.models import CATEGORY_LABELS, MigrationBundle
from rocmport.pipeline import analyze_repository
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _pipeline_mode_html() -> str:
    """Render an HTML badge describing which pipeline mode will run.

    The agentic badge is shown only when CrewAI is importable AND both
    ``QWEN_BASE_URL`` and ``QWEN_API_KEY`` are set to non-blank values;
    otherwise the deterministic-mode badge is returned.
    """
    agentic = (
        CREWAI_AVAILABLE
        and bool(os.getenv("QWEN_BASE_URL", "").strip())
        and bool(os.getenv("QWEN_API_KEY", "").strip())
    )
    if not agentic:
        return (
            "<div class='mode-badge deterministic'>"
            "⚙️ <strong>Deterministic Mode</strong> — "
            "Set <code>QWEN_BASE_URL</code> & <code>QWEN_API_KEY</code> "
            "to enable the full CrewAI multi-agent pipeline."
            "</div>"
        )
    return (
        "<div class='mode-badge agentic'>"
        "🤖 <strong>CrewAI Agentic Mode</strong> — "
        "CUDA Auditor → ROCm Engineer → Report Writer agents active "
        "(powered by Qwen3-Coder on AMD Instinct)"
        "</div>"
    )
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Directory layout: all per-run artifacts are written under artifacts/runtime
# next to this file; the directory is created eagerly at import time.
PROJECT_ROOT = Path(__file__).resolve().parent
RUNTIME_DIR = PROJECT_ROOT / "artifacts" / "runtime"
RUNTIME_DIR.mkdir(parents=True, exist_ok=True)
# Let the rocmport package place its temp files alongside the UI artifacts,
# unless the operator already configured a location.
os.environ.setdefault("ROCMPORT_TMP_DIR", str(RUNTIME_DIR))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def run_analysis(source_mode: str, uploaded_zip: str | None, github_url: str, branch: str) -> tuple[Any, ...]:
    """Run the full migration analysis and format outputs for the Gradio UI.

    Returns a 10-tuple matching the ``analyze_button.click`` outputs; on any
    failure a placeholder tuple carrying the error message is returned so the
    UI never sees an unhandled exception.
    """
    try:
        prepared = _prepare_repo(source_mode, uploaded_zip, github_url, branch)
        output_dir = RUNTIME_DIR / f"rocmport-ui-artifacts-{uuid.uuid4().hex}"
        output_dir.mkdir(parents=True, exist_ok=False)
        bundle = analyze_repository(prepared.path, output_dir=output_dir, repo_name=prepared.name)
        return _format_outputs(bundle)
    except Exception as exc:  # UI boundary: surface any failure as a message.
        error = f"Analysis failed: {exc}"
        # Escape before embedding in HTML so exception text cannot inject markup.
        empty_scores = (
            "<div class='score-card'><h2>Analysis failed</h2>"
            f"<p>{html.escape(error)}</p></div>"
        )
        return (
            empty_scores,  # score HTML
            [],            # findings table
            error,         # migration plan
            "",            # patch diff
            "",            # dockerfile
            "",            # runbook
            error,         # benchmark summary
            "{}",          # benchmark JSON
            error,         # report
            None,          # artifact zip
        )
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _prepare_repo(source_mode: str, uploaded_zip: str | None, github_url: str, branch: str) -> PreparedRepo:
|
| 71 |
+
if source_mode == "Built-in sample":
|
| 72 |
+
sample = sample_repo_path(PROJECT_ROOT)
|
| 73 |
+
return PreparedRepo(path=sample, name="cuda_first_repo")
|
| 74 |
+
if source_mode == "Uploaded ZIP":
|
| 75 |
+
if not uploaded_zip:
|
| 76 |
+
raise ValueError("Upload a ZIP file or switch to the built-in sample.")
|
| 77 |
+
return prepare_uploaded_zip(uploaded_zip)
|
| 78 |
+
if source_mode == "Public GitHub URL":
|
| 79 |
+
if not github_url.strip():
|
| 80 |
+
raise ValueError("Enter a public GitHub repository URL.")
|
| 81 |
+
return prepare_github_repo(github_url, branch.strip() or "main")
|
| 82 |
+
raise ValueError("Unknown source mode.")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _format_outputs(bundle: MigrationBundle) -> tuple[Any, ...]:
    """Assemble the ten Gradio outputs (scan, patch, benchmark, report tabs)."""
    serialized_benchmark = json.dumps(bundle.benchmark, indent=2)
    outputs: list[Any] = [
        _score_html(bundle),
        bundle.findings_table(),
        _migration_plan_markdown(bundle),
        bundle.patch_diff,
        bundle.dockerfile,
        bundle.runbook,
        _benchmark_markdown(bundle.benchmark),
        serialized_benchmark,
        bundle.report,
        bundle.artifact_paths.get("rocmport_artifacts.zip"),
    ]
    return tuple(outputs)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _score_html(bundle: MigrationBundle) -> str:
    """Render the AMD readiness scores as HTML.

    Produces three summary cards (before total, after total, finding count)
    followed by a per-category before/after table. Each score drives a CSS
    meter-bar width, so values are presumably 0-100 percentages — TODO confirm
    against the scoring code in rocmport.models.
    """
    rows = []
    for category, label in CATEGORY_LABELS.items():
        before = bundle.before_score.categories[category]
        after = bundle.after_score.categories[category]
        rows.append(
            f"""
            <tr>
                <td>{label}</td>
                <td><div class="meter"><span style="width:{before}%"></span></div><strong>{before}</strong></td>
                <td><div class="meter after"><span style="width:{after}%"></span></div><strong>{after}</strong></td>
            </tr>
            """
        )
    return f"""
    <div class="score-wrap">
        <div class="score-card">
            <div class="score-label">Before</div>
            <div class="score-number">{bundle.before_score.total}</div>
        </div>
        <div class="score-card">
            <div class="score-label">Migration package</div>
            <div class="score-number after-text">{bundle.after_score.total}</div>
        </div>
        <div class="score-card">
            <div class="score-label">Findings</div>
            <div class="score-number">{len(bundle.findings)}</div>
        </div>
    </div>
    <table class="score-table">
        <thead><tr><th>Category</th><th>Before</th><th>Migration package</th></tr></thead>
        <tbody>{''.join(rows)}</tbody>
    </table>
    """
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _migration_plan_markdown(bundle: MigrationBundle) -> str:
|
| 138 |
+
if not bundle.findings:
|
| 139 |
+
return "### Migration Plan\n\nNo blockers were found. Run the generated AMD Developer Cloud smoke test before submission."
|
| 140 |
+
grouped: dict[str, list[str]] = {}
|
| 141 |
+
for finding in bundle.findings:
|
| 142 |
+
grouped.setdefault(finding.category, []).append(
|
| 143 |
+
f"- `{finding.path}:{finding.line}`: {finding.suggested_fix}"
|
| 144 |
+
)
|
| 145 |
+
sections = ["### Migration Plan"]
|
| 146 |
+
for category, label in CATEGORY_LABELS.items():
|
| 147 |
+
if category not in grouped:
|
| 148 |
+
continue
|
| 149 |
+
sections.append(f"\n#### {label}\n" + "\n".join(grouped[category][:8]))
|
| 150 |
+
return "\n".join(sections)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _benchmark_markdown(benchmark: dict[str, Any]) -> str:
|
| 154 |
+
verified = benchmark.get("verified", False)
|
| 155 |
+
status = "Verified AMD Developer Cloud run" if verified else "Pending AMD Developer Cloud run"
|
| 156 |
+
lines = [
|
| 157 |
+
f"### {status}",
|
| 158 |
+
"",
|
| 159 |
+
f"- Hardware: `{benchmark.get('hardware', 'not captured')}`",
|
| 160 |
+
f"- ROCm: `{benchmark.get('rocm_version', 'not captured')}`",
|
| 161 |
+
f"- vLLM: `{benchmark.get('vllm_version', 'not captured')}`",
|
| 162 |
+
f"- Model: `{benchmark.get('model', 'not captured')}`",
|
| 163 |
+
f"- Throughput tokens/sec: `{benchmark.get('throughput_tokens_per_second', 'not captured')}`",
|
| 164 |
+
f"- P50 latency ms: `{benchmark.get('p50_latency_ms', 'not captured')}`",
|
| 165 |
+
f"- Peak VRAM GB: `{benchmark.get('peak_vram_gb', 'not captured')}`",
|
| 166 |
+
"",
|
| 167 |
+
benchmark.get("notes", "Run the generated AMD Developer Cloud runbook to replace this record with measured values."),
|
| 168 |
+
]
|
| 169 |
+
return "\n".join(lines)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
CSS = """
|
| 173 |
+
.gradio-container { max-width: 1280px !important; }
|
| 174 |
+
.mode-badge {
|
| 175 |
+
padding: 10px 16px;
|
| 176 |
+
border-radius: 8px;
|
| 177 |
+
font-size: 14px;
|
| 178 |
+
margin-bottom: 16px;
|
| 179 |
+
border: 1px solid;
|
| 180 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.05);
|
| 181 |
+
}
|
| 182 |
+
.mode-badge.agentic {
|
| 183 |
+
background: rgba(8, 127, 91, 0.1);
|
| 184 |
+
border-color: #087f5b;
|
| 185 |
+
color: var(--body-text-color);
|
| 186 |
+
}
|
| 187 |
+
.mode-badge.deterministic {
|
| 188 |
+
background: rgba(54, 79, 199, 0.1);
|
| 189 |
+
border-color: #748ffc;
|
| 190 |
+
color: var(--body-text-color);
|
| 191 |
+
}
|
| 192 |
+
.score-wrap {
|
| 193 |
+
display: grid;
|
| 194 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 195 |
+
gap: 16px;
|
| 196 |
+
margin: 12px 0 24px;
|
| 197 |
+
}
|
| 198 |
+
.score-card {
|
| 199 |
+
border: 1px solid var(--border-color-primary);
|
| 200 |
+
border-radius: 12px;
|
| 201 |
+
padding: 20px;
|
| 202 |
+
background: var(--background-fill-secondary);
|
| 203 |
+
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
|
| 204 |
+
transition: transform 0.2s ease, box-shadow 0.2s ease;
|
| 205 |
+
}
|
| 206 |
+
.score-card:hover {
|
| 207 |
+
transform: translateY(-2px);
|
| 208 |
+
box-shadow: 0 8px 24px rgba(237, 28, 36, 0.15);
|
| 209 |
+
}
|
| 210 |
+
.score-label {
|
| 211 |
+
color: var(--body-text-color-subdued);
|
| 212 |
+
font-size: 14px;
|
| 213 |
+
text-transform: uppercase;
|
| 214 |
+
letter-spacing: 0.5px;
|
| 215 |
+
margin-bottom: 8px;
|
| 216 |
+
font-weight: 600;
|
| 217 |
+
}
|
| 218 |
+
.score-number {
|
| 219 |
+
color: var(--body-text-color);
|
| 220 |
+
font-size: 42px;
|
| 221 |
+
line-height: 1;
|
| 222 |
+
font-weight: 800;
|
| 223 |
+
}
|
| 224 |
+
.after-text { color: #ed1c24; } /* AMD Red */
|
| 225 |
+
.score-table {
|
| 226 |
+
width: 100%;
|
| 227 |
+
border-collapse: collapse;
|
| 228 |
+
margin-top: 12px;
|
| 229 |
+
}
|
| 230 |
+
.score-table th {
|
| 231 |
+
text-transform: uppercase;
|
| 232 |
+
font-size: 12px;
|
| 233 |
+
color: var(--body-text-color-subdued);
|
| 234 |
+
}
|
| 235 |
+
.score-table th,
|
| 236 |
+
.score-table td {
|
| 237 |
+
border-bottom: 1px solid var(--border-color-primary);
|
| 238 |
+
padding: 12px 8px;
|
| 239 |
+
text-align: left;
|
| 240 |
+
}
|
| 241 |
+
.meter {
|
| 242 |
+
width: calc(100% - 48px);
|
| 243 |
+
height: 10px;
|
| 244 |
+
background: var(--background-fill-primary);
|
| 245 |
+
border-radius: 5px;
|
| 246 |
+
display: inline-block;
|
| 247 |
+
vertical-align: middle;
|
| 248 |
+
margin-right: 8px;
|
| 249 |
+
overflow: hidden;
|
| 250 |
+
}
|
| 251 |
+
.meter span {
|
| 252 |
+
display: block;
|
| 253 |
+
height: 100%;
|
| 254 |
+
background: var(--body-text-color-subdued);
|
| 255 |
+
border-radius: 5px;
|
| 256 |
+
transition: width 1s cubic-bezier(0.4, 0, 0.2, 1);
|
| 257 |
+
}
|
| 258 |
+
.meter.after span { background: linear-gradient(90deg, #b80000 0%, #ed1c24 100%); }
|
| 259 |
+
#findings-table table {
|
| 260 |
+
table-layout: fixed;
|
| 261 |
+
}
|
| 262 |
+
#findings-table th {
|
| 263 |
+
white-space: nowrap;
|
| 264 |
+
}
|
| 265 |
+
"""
|
| 266 |
+
|
| 267 |
+
# Gradio Soft theme customised with AMD-red primary buttons (gradient fills)
# and softer block chrome (drop shadow, no borders, large radius).
THEME = gr.themes.Soft(
    primary_hue="red",
    neutral_hue="zinc",
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
).set(
    button_primary_background_fill="linear-gradient(90deg, #ed1c24 0%, #b80000 100%)",
    button_primary_background_fill_hover="linear-gradient(90deg, #ff333a 0%, #cc0000 100%)",
    button_primary_text_color="white",
    block_title_text_weight="600",
    block_shadow="*shadow_drop_lg",
    block_border_width="0px",
    block_radius="*radius_lg"
)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# theme/css must be passed to the gr.Blocks constructor — Blocks.launch()
# does not accept them (they were previously passed to launch and ignored
# or rejected there).
with gr.Blocks(title="ROCmPort AI ⚡", theme=THEME, css=CSS) as demo:
    gr.Markdown("# ROCmPort AI ⚡")
    gr.Markdown("CUDA-to-ROCm migration scanner for PyTorch, Hugging Face, and vLLM repositories.")
    gr.HTML(_pipeline_mode_html())
    gr.Markdown(
        "> **How it works:** Three CrewAI agents collaborate to migrate the repository. All scoring is deterministic.\n\n"
        "```text\n"
        "            [ 📁 User Repository ]\n"
        "                      │\n"
        "                      ▼\n"
        "              [ 🖥️ Gradio UI ]\n"
        "                      │\n"
        "                      ▼\n"
        "               [ ⚙️ Pipeline ]\n"
        "                      │\n"
        "    ┌─────────────────┴─────────────────┐\n"
        "(Agentic Workflow)          (Deterministic Fallback)\n"
        "    │                                   │\n"
        "[ 🕵️ CUDA Auditor ]               [ 🔍 Scanner ]\n"
        "    │                                   │\n"
        "[ 🛠️ ROCm Engineer ]              [ 🩹 Patcher ]\n"
        "    │                                   │\n"
        "[ 📝 Report Writer ]             [ 📦 Artifacts ]\n"
        "    │                                   │\n"
        "(🧠 Qwen3 on MI300X)                    │\n"
        "    │                                   │\n"
        "    └─────────────────┬─────────────────┘\n"
        "                      ▼\n"
        "       [ 🎯 Final Migration Package ]\n"
        "```"
    )

    # --- Input controls ---------------------------------------------------
    with gr.Row():
        source_mode = gr.Radio(
            choices=["Built-in sample", "Uploaded ZIP", "Public GitHub URL"],
            value="Built-in sample",
            label="Repository source",
        )
        uploaded_zip = gr.File(label="Repository ZIP", type="filepath", file_types=[".zip"])
    with gr.Row():
        github_url = gr.Textbox(label="GitHub URL", placeholder="https://github.com/owner/repo")
        branch = gr.Textbox(label="Branch", value="main")

    analyze_button = gr.Button("Analyze repository", variant="primary")

    # --- Output tabs (order mirrors run_analysis's 10-tuple) --------------
    with gr.Tabs():
        with gr.Tab("Scan"):
            score_html = gr.HTML(label="AMD Readiness Score")
            findings_table = gr.Dataframe(
                headers=["Severity", "Category", "Path", "Line", "Finding", "Suggested fix"],
                label="Findings",
                wrap=True,
                column_widths=[92, 210, 260, 72, 500, 620],
                elem_id="findings-table",
            )
            migration_plan = gr.Markdown(label="Migration Plan")
        with gr.Tab("Patch"):
            patch_diff = gr.Code(label="rocm_patch.diff", language=None, lines=20)
            dockerfile = gr.Code(label="Dockerfile.rocm", language="dockerfile", lines=18)
            runbook = gr.Markdown(label="AMD Developer Cloud Runbook")
        with gr.Tab("Benchmark"):
            benchmark_md = gr.Markdown(label="Benchmark Summary")
            benchmark_json = gr.Code(label="benchmark_result.json", language="json", lines=18)
        with gr.Tab("Report"):
            report_md = gr.Markdown(label="Migration Report")
            artifact_zip = gr.File(label="Download migration artifact bundle")

    analyze_button.click(
        fn=run_analysis,
        inputs=[source_mode, uploaded_zip, github_url, branch],
        outputs=[
            score_html,
            findings_table,
            migration_plan,
            patch_diff,
            dockerfile,
            runbook,
            benchmark_md,
            benchmark_json,
            report_md,
            artifact_zip,
        ],
    )
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
if __name__ == "__main__":
    # Bind to all interfaces when running inside a Hugging Face Space
    # (SPACE_ID is set there); stay loopback-only for local development.
    server_name = os.getenv("GRADIO_SERVER_NAME") or ("0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1")
    server_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
    # NOTE: theme/css are gr.Blocks constructor arguments, not launch()
    # keyword arguments — passing them here is invalid, so they are omitted.
    demo.launch(server_name=server_name, server_port=server_port, quiet=True)
|
artifacts/check-scoring/Dockerfile.rocm
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM vllm/vllm-openai-rocm:latest
|
| 2 |
+
|
| 3 |
+
WORKDIR /workspace/cuda_first_repo
|
| 4 |
+
COPY . /workspace/cuda_first_repo
|
| 5 |
+
|
| 6 |
+
RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
|
| 7 |
+
|
| 8 |
+
ENV HIP_VISIBLE_DEVICES=0
|
| 9 |
+
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
| 10 |
+
|
| 11 |
+
CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
|
artifacts/check-scoring/ROCM_FEEDBACK.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm / AMD Developer Cloud Feedback
|
| 2 |
+
|
| 3 |
+
## What worked well
|
| 4 |
+
|
| 5 |
+
- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
|
| 6 |
+
- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
|
| 7 |
+
- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
|
| 8 |
+
|
| 9 |
+
## Friction points to document during the live run
|
| 10 |
+
|
| 11 |
+
- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
|
| 12 |
+
- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
|
| 13 |
+
- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
|
| 14 |
+
|
| 15 |
+
## Suggested product improvement
|
| 16 |
+
|
| 17 |
+
Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
|
artifacts/check-scoring/ROCM_MIGRATION_COOKBOOK.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm Migration Cookbook
|
| 2 |
+
|
| 3 |
+
## PyTorch device handling
|
| 4 |
+
|
| 5 |
+
Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 11 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 12 |
+
model = model.to(device)
|
| 13 |
+
inputs = inputs.to(device)
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## GPU inspection
|
| 17 |
+
|
| 18 |
+
Replace NVIDIA-only commands with ROCm equivalents:
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## Containers
|
| 25 |
+
|
| 26 |
+
For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
docker pull vllm/vllm-openai-rocm:latest
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Run with AMD GPU device access:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Manual review cases
|
| 39 |
+
|
| 40 |
+
Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
|
artifacts/check-scoring/amd_developer_cloud_runbook.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AMD Developer Cloud Runbook
|
| 2 |
+
|
| 3 |
+
This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
|
| 4 |
+
|
| 5 |
+
## 1. Create an AMD GPU VM
|
| 6 |
+
|
| 7 |
+
Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
|
| 8 |
+
|
| 9 |
+
## 2. Build the ROCm container
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
## 3. Run a smoke check
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
docker run --rm -it \
|
| 19 |
+
--device /dev/kfd \
|
| 20 |
+
--device /dev/dri \
|
| 21 |
+
--group-add video \
|
| 22 |
+
--ipc=host \
|
| 23 |
+
--network=host \
|
| 24 |
+
--security-opt seccomp=unconfined \
|
| 25 |
+
rocmport-cuda_first_repo
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## 4. Run vLLM on ROCm
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
docker run --rm -it \
|
| 32 |
+
--device /dev/kfd \
|
| 33 |
+
--device /dev/dri \
|
| 34 |
+
--group-add video \
|
| 35 |
+
--ipc=host \
|
| 36 |
+
--network=host \
|
| 37 |
+
--security-opt seccomp=unconfined \
|
| 38 |
+
-v "$PWD:/workspace/cuda_first_repo" \
|
| 39 |
+
vllm/vllm-openai-rocm:latest \
|
| 40 |
+
vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## 5. Capture benchmark metadata
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 47 |
+
python scripts/collect_benchmark_result.py --output benchmark_result.json
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Replace `data/benchmark_result.json` with the captured result before final submission.
|
artifacts/check-scoring/benchmark_result.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verified": false,
|
| 3 |
+
"status": "pending_external_amd_developer_cloud_run",
|
| 4 |
+
"hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
|
| 5 |
+
"rocm_version": "capture with rocminfo or container metadata",
|
| 6 |
+
"vllm_version": "capture with python -m vllm --version",
|
| 7 |
+
"model": "Qwen/Qwen3-Coder-Next-FP8",
|
| 8 |
+
"prompt_config": {
|
| 9 |
+
"input_tokens": 512,
|
| 10 |
+
"output_tokens": 256,
|
| 11 |
+
"concurrency": 8,
|
| 12 |
+
"requests": 64
|
| 13 |
+
},
|
| 14 |
+
"throughput_tokens_per_second": null,
|
| 15 |
+
"p50_latency_ms": null,
|
| 16 |
+
"p95_latency_ms": null,
|
| 17 |
+
"peak_vram_gb": null,
|
| 18 |
+
"log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
|
| 19 |
+
"notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
|
| 20 |
+
}
|
artifacts/check-scoring/migration_report.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCmPort AI Migration Report: cuda_first_repo
|
| 2 |
+
|
| 3 |
+
## AMD Readiness Score
|
| 4 |
+
|
| 5 |
+
- Before deterministic fixes: 51/100
|
| 6 |
+
- Migration package generated: 90/100
|
| 7 |
+
- This score means ROCm migration artifacts were generated and are ready for AMD Developer Cloud validation; it is not a production certification.
|
| 8 |
+
|
| 9 |
+
| Category | Before | Migration package |
|
| 10 |
+
| --- | ---: | ---: |
|
| 11 |
+
| Code portability | 0 | 86 |
|
| 12 |
+
| Environment readiness | 0 | 86 |
|
| 13 |
+
| Serving readiness | 90 | 98 |
|
| 14 |
+
| Benchmark readiness | 65 | 85 |
|
| 15 |
+
| Deployment readiness | 100 | 95 |
|
| 16 |
+
|
| 17 |
+
## Findings
|
| 18 |
+
|
| 19 |
+
| Severity | Category | Location | Finding | Suggested fix |
|
| 20 |
+
| --- | --- | --- | --- | --- |
|
| 21 |
+
| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 22 |
+
| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
|
| 23 |
+
| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 24 |
+
| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 25 |
+
| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 26 |
+
| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 27 |
+
| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 28 |
+
| medium | Environment readiness | `scripts/serve_vllm.sh:4` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 29 |
+
| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 30 |
+
| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
|
| 31 |
+
|
| 32 |
+
## Generated Artifacts
|
| 33 |
+
|
| 34 |
+
- `rocm_patch.diff` contains deterministic MVP fixes.
|
| 35 |
+
- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
|
| 36 |
+
- `amd_developer_cloud_runbook.md` documents the validation path.
|
| 37 |
+
- `benchmark_result.json` records the AMD benchmark schema and status.
|
| 38 |
+
|
| 39 |
+
## Qwen Agent Notes
|
| 40 |
+
|
| 41 |
+
Qwen endpoint was not configured. The report uses deterministic scanner output only.
|
| 42 |
+
|
| 43 |
+
## Remaining Risks
|
| 44 |
+
|
| 45 |
+
- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
|
| 46 |
+
- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
|
| 47 |
+
- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
|
artifacts/check-scoring/rocm_patch.diff
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- a/Dockerfile
|
| 2 |
+
+++ b/Dockerfile
|
| 3 |
+
@@ -1,10 +1,10 @@
|
| 4 |
+
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 5 |
+
+FROM vllm/vllm-openai-rocm:latest
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
-ENV NVIDIA_VISIBLE_DEVICES=all
|
| 13 |
+
+ENV HIP_VISIBLE_DEVICES=all
|
| 14 |
+
|
| 15 |
+
CMD ["python", "infer.py"]
|
| 16 |
+
--- a/infer.py
|
| 17 |
+
+++ b/infer.py
|
| 18 |
+
@@ -1,15 +1,18 @@
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
+
|
| 22 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 23 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 24 |
+
+
|
| 25 |
+
|
| 26 |
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 27 |
+
-device = torch.device("cuda")
|
| 28 |
+
+device = _rocmport_device
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main():
|
| 32 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 33 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
|
| 34 |
+
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
|
| 35 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
|
| 36 |
+
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
|
| 37 |
+
with torch.no_grad():
|
| 38 |
+
outputs = model.generate(**inputs, max_new_tokens=64)
|
| 39 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 40 |
+
--- a/scripts/serve_vllm.sh
|
| 41 |
+
+++ b/scripts/serve_vllm.sh
|
| 42 |
+
@@ -1,6 +1,6 @@
|
| 43 |
+
#!/usr/bin/env bash
|
| 44 |
+
set -euo pipefail
|
| 45 |
+
|
| 46 |
+
-export CUDA_VISIBLE_DEVICES=0
|
| 47 |
+
-nvidia-smi
|
| 48 |
+
+export HIP_VISIBLE_DEVICES=0
|
| 49 |
+
+rocm-smi
|
| 50 |
+
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
|
artifacts/check-scoring/rocmport_artifacts.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f46889d5d26e62603bf801b51a98a18d203ef648bf52009a0be2777c57dc359a
|
| 3 |
+
size 5349
|
artifacts/check/Dockerfile.rocm
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM vllm/vllm-openai-rocm:latest
|
| 2 |
+
|
| 3 |
+
WORKDIR /workspace/cuda_first_repo
|
| 4 |
+
COPY . /workspace/cuda_first_repo
|
| 5 |
+
|
| 6 |
+
RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
|
| 7 |
+
|
| 8 |
+
ENV HIP_VISIBLE_DEVICES=0
|
| 9 |
+
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
| 10 |
+
|
| 11 |
+
CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
|
artifacts/check/ROCM_FEEDBACK.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm / AMD Developer Cloud Feedback
|
| 2 |
+
|
| 3 |
+
## What worked well
|
| 4 |
+
|
| 5 |
+
- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
|
| 6 |
+
- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
|
| 7 |
+
- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
|
| 8 |
+
|
| 9 |
+
## Friction points to document during the live run
|
| 10 |
+
|
| 11 |
+
- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
|
| 12 |
+
- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
|
| 13 |
+
- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
|
| 14 |
+
|
| 15 |
+
## Suggested product improvement
|
| 16 |
+
|
| 17 |
+
Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
|
artifacts/check/ROCM_MIGRATION_COOKBOOK.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm Migration Cookbook
|
| 2 |
+
|
| 3 |
+
## PyTorch device handling
|
| 4 |
+
|
| 5 |
+
Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 11 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 12 |
+
model = model.to(device)
|
| 13 |
+
inputs = inputs.to(device)
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## GPU inspection
|
| 17 |
+
|
| 18 |
+
Replace NVIDIA-only commands with ROCm equivalents:
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## Containers
|
| 25 |
+
|
| 26 |
+
For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
docker pull vllm/vllm-openai-rocm:latest
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Run with AMD GPU device access:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Manual review cases
|
| 39 |
+
|
| 40 |
+
Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
|
artifacts/check/amd_developer_cloud_runbook.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AMD Developer Cloud Runbook
|
| 2 |
+
|
| 3 |
+
This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
|
| 4 |
+
|
| 5 |
+
## 1. Create an AMD GPU VM
|
| 6 |
+
|
| 7 |
+
Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
|
| 8 |
+
|
| 9 |
+
## 2. Build the ROCm container
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
## 3. Run a smoke check
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
docker run --rm -it \
|
| 19 |
+
--device /dev/kfd \
|
| 20 |
+
--device /dev/dri \
|
| 21 |
+
--group-add video \
|
| 22 |
+
--ipc=host \
|
| 23 |
+
--network=host \
|
| 24 |
+
--security-opt seccomp=unconfined \
|
| 25 |
+
rocmport-cuda_first_repo
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## 4. Run vLLM on ROCm
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
docker run --rm -it \
|
| 32 |
+
--device /dev/kfd \
|
| 33 |
+
--device /dev/dri \
|
| 34 |
+
--group-add video \
|
| 35 |
+
--ipc=host \
|
| 36 |
+
--network=host \
|
| 37 |
+
--security-opt seccomp=unconfined \
|
| 38 |
+
-v "$PWD:/workspace/cuda_first_repo" \
|
| 39 |
+
vllm/vllm-openai-rocm:latest \
|
| 40 |
+
vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## 5. Capture benchmark metadata
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 47 |
+
python scripts/collect_benchmark_result.py --output benchmark_result.json
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Replace `data/benchmark_result.json` with the captured result before final submission.
|
artifacts/check/benchmark_result.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verified": false,
|
| 3 |
+
"status": "pending_external_amd_developer_cloud_run",
|
| 4 |
+
"hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
|
| 5 |
+
"rocm_version": "capture with rocminfo or container metadata",
|
| 6 |
+
"vllm_version": "capture with python -m vllm --version",
|
| 7 |
+
"model": "Qwen/Qwen3-Coder-Next-FP8",
|
| 8 |
+
"prompt_config": {
|
| 9 |
+
"input_tokens": 512,
|
| 10 |
+
"output_tokens": 256,
|
| 11 |
+
"concurrency": 8,
|
| 12 |
+
"requests": 64
|
| 13 |
+
},
|
| 14 |
+
"throughput_tokens_per_second": null,
|
| 15 |
+
"p50_latency_ms": null,
|
| 16 |
+
"p95_latency_ms": null,
|
| 17 |
+
"peak_vram_gb": null,
|
| 18 |
+
"log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
|
| 19 |
+
"notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
|
| 20 |
+
}
|
artifacts/check/migration_report.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCmPort AI Migration Report: cuda_first_repo
|
| 2 |
+
|
| 3 |
+
## AMD Readiness Score
|
| 4 |
+
|
| 5 |
+
- Before deterministic fixes: 53/100
|
| 6 |
+
- After deterministic fixes: 100/100
|
| 7 |
+
|
| 8 |
+
| Category | Before | After |
|
| 9 |
+
| --- | ---: | ---: |
|
| 10 |
+
| Code portability | 0 | 100 |
|
| 11 |
+
| Environment readiness | 8 | 100 |
|
| 12 |
+
| Serving readiness | 90 | 100 |
|
| 13 |
+
| Benchmark readiness | 65 | 100 |
|
| 14 |
+
| Deployment readiness | 100 | 100 |
|
| 15 |
+
|
| 16 |
+
## Findings
|
| 17 |
+
|
| 18 |
+
| Severity | Category | Location | Finding | Suggested fix |
|
| 19 |
+
| --- | --- | --- | --- | --- |
|
| 20 |
+
| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 21 |
+
| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
|
| 22 |
+
| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 23 |
+
| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 24 |
+
| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 25 |
+
| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 26 |
+
| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 27 |
+
| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 28 |
+
| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
|
| 29 |
+
|
| 30 |
+
## Generated Artifacts
|
| 31 |
+
|
| 32 |
+
- `rocm_patch.diff` contains deterministic MVP fixes.
|
| 33 |
+
- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
|
| 34 |
+
- `amd_developer_cloud_runbook.md` documents the validation path.
|
| 35 |
+
- `benchmark_result.json` records the AMD benchmark schema and status.
|
| 36 |
+
|
| 37 |
+
## Qwen Agent Notes
|
| 38 |
+
|
| 39 |
+
Qwen endpoint was not configured. The report uses deterministic scanner output only.
|
| 40 |
+
|
| 41 |
+
## Remaining Risks
|
| 42 |
+
|
| 43 |
+
- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
|
| 44 |
+
- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
|
| 45 |
+
- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
|
artifacts/check/rocm_patch.diff
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- a/Dockerfile
|
| 2 |
+
+++ b/Dockerfile
|
| 3 |
+
@@ -1,10 +1,10 @@
|
| 4 |
+
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 5 |
+
+FROM vllm/vllm-openai-rocm:latest
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
-ENV NVIDIA_VISIBLE_DEVICES=all
|
| 13 |
+
+ENV HIP_VISIBLE_DEVICES=all
|
| 14 |
+
|
| 15 |
+
CMD ["python", "infer.py"]
|
| 16 |
+
--- a/infer.py
|
| 17 |
+
+++ b/infer.py
|
| 18 |
+
@@ -1,15 +1,18 @@
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
+
|
| 22 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 23 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 24 |
+
+
|
| 25 |
+
|
| 26 |
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 27 |
+
-device = torch.device("cuda")
|
| 28 |
+
+device = _rocmport_device
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main():
|
| 32 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 33 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
|
| 34 |
+
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
|
| 35 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
|
| 36 |
+
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
|
| 37 |
+
with torch.no_grad():
|
| 38 |
+
outputs = model.generate(**inputs, max_new_tokens=64)
|
| 39 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 40 |
+
--- a/scripts/serve_vllm.sh
|
| 41 |
+
+++ b/scripts/serve_vllm.sh
|
| 42 |
+
@@ -2,5 +2,5 @@
|
| 43 |
+
set -euo pipefail
|
| 44 |
+
|
| 45 |
+
export CUDA_VISIBLE_DEVICES=0
|
| 46 |
+
-nvidia-smi
|
| 47 |
+
+rocm-smi
|
| 48 |
+
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
|
artifacts/check/rocmport_artifacts.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a416cb1bb90125df6f5b63bc41032aa6543b74b8b5b2a431761cde14b5a52d5b
|
| 3 |
+
size 5229
|
artifacts/check2/Dockerfile.rocm
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM vllm/vllm-openai-rocm:latest
|
| 2 |
+
|
| 3 |
+
WORKDIR /workspace/cuda_first_repo
|
| 4 |
+
COPY . /workspace/cuda_first_repo
|
| 5 |
+
|
| 6 |
+
RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
|
| 7 |
+
|
| 8 |
+
ENV HIP_VISIBLE_DEVICES=0
|
| 9 |
+
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
| 10 |
+
|
| 11 |
+
CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
|
artifacts/check2/ROCM_FEEDBACK.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm / AMD Developer Cloud Feedback
|
| 2 |
+
|
| 3 |
+
## What worked well
|
| 4 |
+
|
| 5 |
+
- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
|
| 6 |
+
- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
|
| 7 |
+
- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
|
| 8 |
+
|
| 9 |
+
## Friction points to document during the live run
|
| 10 |
+
|
| 11 |
+
- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
|
| 12 |
+
- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
|
| 13 |
+
- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
|
| 14 |
+
|
| 15 |
+
## Suggested product improvement
|
| 16 |
+
|
| 17 |
+
Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
|
artifacts/check2/ROCM_MIGRATION_COOKBOOK.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm Migration Cookbook
|
| 2 |
+
|
| 3 |
+
## PyTorch device handling
|
| 4 |
+
|
| 5 |
+
Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 11 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 12 |
+
model = model.to(device)
|
| 13 |
+
inputs = inputs.to(device)
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## GPU inspection
|
| 17 |
+
|
| 18 |
+
Replace NVIDIA-only commands with ROCm equivalents:
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## Containers
|
| 25 |
+
|
| 26 |
+
For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
docker pull vllm/vllm-openai-rocm:latest
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Run with AMD GPU device access:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Manual review cases
|
| 39 |
+
|
| 40 |
+
Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
|
artifacts/check2/amd_developer_cloud_runbook.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AMD Developer Cloud Runbook
|
| 2 |
+
|
| 3 |
+
This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
|
| 4 |
+
|
| 5 |
+
## 1. Create an AMD GPU VM
|
| 6 |
+
|
| 7 |
+
Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
|
| 8 |
+
|
| 9 |
+
## 2. Build the ROCm container
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
## 3. Run a smoke check
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
docker run --rm -it \
|
| 19 |
+
--device /dev/kfd \
|
| 20 |
+
--device /dev/dri \
|
| 21 |
+
--group-add video \
|
| 22 |
+
--ipc=host \
|
| 23 |
+
--network=host \
|
| 24 |
+
--security-opt seccomp=unconfined \
|
| 25 |
+
rocmport-cuda_first_repo
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## 4. Run vLLM on ROCm
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
docker run --rm -it \
|
| 32 |
+
--device /dev/kfd \
|
| 33 |
+
--device /dev/dri \
|
| 34 |
+
--group-add video \
|
| 35 |
+
--ipc=host \
|
| 36 |
+
--network=host \
|
| 37 |
+
--security-opt seccomp=unconfined \
|
| 38 |
+
-v "$PWD:/workspace/cuda_first_repo" \
|
| 39 |
+
vllm/vllm-openai-rocm:latest \
|
| 40 |
+
vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## 5. Capture benchmark metadata
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 47 |
+
python scripts/collect_benchmark_result.py --output benchmark_result.json
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Replace `data/benchmark_result.json` with the captured result before final submission.
|
artifacts/check2/benchmark_result.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verified": false,
|
| 3 |
+
"status": "pending_external_amd_developer_cloud_run",
|
| 4 |
+
"hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
|
| 5 |
+
"rocm_version": "capture with rocminfo or container metadata",
|
| 6 |
+
"vllm_version": "capture with python -m vllm --version",
|
| 7 |
+
"model": "Qwen/Qwen3-Coder-Next-FP8",
|
| 8 |
+
"prompt_config": {
|
| 9 |
+
"input_tokens": 512,
|
| 10 |
+
"output_tokens": 256,
|
| 11 |
+
"concurrency": 8,
|
| 12 |
+
"requests": 64
|
| 13 |
+
},
|
| 14 |
+
"throughput_tokens_per_second": null,
|
| 15 |
+
"p50_latency_ms": null,
|
| 16 |
+
"p95_latency_ms": null,
|
| 17 |
+
"peak_vram_gb": null,
|
| 18 |
+
"log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
|
| 19 |
+
"notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
|
| 20 |
+
}
|
artifacts/check2/migration_report.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCmPort AI Migration Report: cuda_first_repo
|
| 2 |
+
|
| 3 |
+
## AMD Readiness Score
|
| 4 |
+
|
| 5 |
+
- Before deterministic fixes: 51/100
|
| 6 |
+
- After deterministic fixes: 100/100
|
| 7 |
+
|
| 8 |
+
| Category | Before | After |
|
| 9 |
+
| --- | ---: | ---: |
|
| 10 |
+
| Code portability | 0 | 100 |
|
| 11 |
+
| Environment readiness | 0 | 100 |
|
| 12 |
+
| Serving readiness | 90 | 100 |
|
| 13 |
+
| Benchmark readiness | 65 | 100 |
|
| 14 |
+
| Deployment readiness | 100 | 100 |
|
| 15 |
+
|
| 16 |
+
## Findings
|
| 17 |
+
|
| 18 |
+
| Severity | Category | Location | Finding | Suggested fix |
|
| 19 |
+
| --- | --- | --- | --- | --- |
|
| 20 |
+
| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 21 |
+
| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
|
| 22 |
+
| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 23 |
+
| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 24 |
+
| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 25 |
+
| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 26 |
+
| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 27 |
+
| medium | Environment readiness | `scripts/serve_vllm.sh:4` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 28 |
+
| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 29 |
+
| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
|
| 30 |
+
|
| 31 |
+
## Generated Artifacts
|
| 32 |
+
|
| 33 |
+
- `rocm_patch.diff` contains deterministic MVP fixes.
|
| 34 |
+
- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
|
| 35 |
+
- `amd_developer_cloud_runbook.md` documents the validation path.
|
| 36 |
+
- `benchmark_result.json` records the AMD benchmark schema and status.
|
| 37 |
+
|
| 38 |
+
## Qwen Agent Notes
|
| 39 |
+
|
| 40 |
+
Qwen endpoint was not configured. The report uses deterministic scanner output only.
|
| 41 |
+
|
| 42 |
+
## Remaining Risks
|
| 43 |
+
|
| 44 |
+
- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
|
| 45 |
+
- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
|
| 46 |
+
- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
|
artifacts/check2/rocm_patch.diff
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- a/Dockerfile
|
| 2 |
+
+++ b/Dockerfile
|
| 3 |
+
@@ -1,10 +1,10 @@
|
| 4 |
+
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 5 |
+
+FROM vllm/vllm-openai-rocm:latest
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
-ENV NVIDIA_VISIBLE_DEVICES=all
|
| 13 |
+
+ENV HIP_VISIBLE_DEVICES=all
|
| 14 |
+
|
| 15 |
+
CMD ["python", "infer.py"]
|
| 16 |
+
--- a/infer.py
|
| 17 |
+
+++ b/infer.py
|
| 18 |
+
@@ -1,15 +1,18 @@
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
+
|
| 22 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 23 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 24 |
+
+
|
| 25 |
+
|
| 26 |
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 27 |
+
-device = torch.device("cuda")
|
| 28 |
+
+device = _rocmport_device
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main():
|
| 32 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 33 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
|
| 34 |
+
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
|
| 35 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
|
| 36 |
+
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
|
| 37 |
+
with torch.no_grad():
|
| 38 |
+
outputs = model.generate(**inputs, max_new_tokens=64)
|
| 39 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 40 |
+
--- a/scripts/serve_vllm.sh
|
| 41 |
+
+++ b/scripts/serve_vllm.sh
|
| 42 |
+
@@ -1,6 +1,6 @@
|
| 43 |
+
#!/usr/bin/env bash
|
| 44 |
+
set -euo pipefail
|
| 45 |
+
|
| 46 |
+
-export CUDA_VISIBLE_DEVICES=0
|
| 47 |
+
-nvidia-smi
|
| 48 |
+
+export HIP_VISIBLE_DEVICES=0
|
| 49 |
+
+rocm-smi
|
| 50 |
+
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
|
artifacts/check2/rocmport_artifacts.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b346f1ca8119e31fb879012c85fb6a20663459f9266944b277e346d3d8c89fa
|
| 3 |
+
size 5275
|
artifacts/hackathon_content.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/Dockerfile.rocm
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM vllm/vllm-openai-rocm:latest
|
| 2 |
+
|
| 3 |
+
WORKDIR /workspace/cuda_first_repo
|
| 4 |
+
COPY . /workspace/cuda_first_repo
|
| 5 |
+
|
| 6 |
+
RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
|
| 7 |
+
|
| 8 |
+
ENV HIP_VISIBLE_DEVICES=0
|
| 9 |
+
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
| 10 |
+
|
| 11 |
+
CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_FEEDBACK.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm / AMD Developer Cloud Feedback
|
| 2 |
+
|
| 3 |
+
## What worked well
|
| 4 |
+
|
| 5 |
+
- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
|
| 6 |
+
- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
|
| 7 |
+
- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
|
| 8 |
+
|
| 9 |
+
## Friction points to document during the live run
|
| 10 |
+
|
| 11 |
+
- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
|
| 12 |
+
- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
|
| 13 |
+
- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
|
| 14 |
+
|
| 15 |
+
## Suggested product improvement
|
| 16 |
+
|
| 17 |
+
Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_MIGRATION_COOKBOOK.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm Migration Cookbook
|
| 2 |
+
|
| 3 |
+
## PyTorch device handling
|
| 4 |
+
|
| 5 |
+
Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 11 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 12 |
+
model = model.to(device)
|
| 13 |
+
inputs = inputs.to(device)
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## GPU inspection
|
| 17 |
+
|
| 18 |
+
Replace NVIDIA-only commands with ROCm equivalents:
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## Containers
|
| 25 |
+
|
| 26 |
+
For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
docker pull vllm/vllm-openai-rocm:latest
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Run with AMD GPU device access:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Manual review cases
|
| 39 |
+
|
| 40 |
+
Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/amd_developer_cloud_runbook.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AMD Developer Cloud Runbook
|
| 2 |
+
|
| 3 |
+
This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
|
| 4 |
+
|
| 5 |
+
## 1. Create an AMD GPU VM
|
| 6 |
+
|
| 7 |
+
Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
|
| 8 |
+
|
| 9 |
+
## 2. Build the ROCm container
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
## 3. Run a smoke check
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
docker run --rm -it \
|
| 19 |
+
--device /dev/kfd \
|
| 20 |
+
--device /dev/dri \
|
| 21 |
+
--group-add video \
|
| 22 |
+
--ipc=host \
|
| 23 |
+
--network=host \
|
| 24 |
+
--security-opt seccomp=unconfined \
|
| 25 |
+
rocmport-cuda_first_repo
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## 4. Run vLLM on ROCm
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
docker run --rm -it \
|
| 32 |
+
--device /dev/kfd \
|
| 33 |
+
--device /dev/dri \
|
| 34 |
+
--group-add video \
|
| 35 |
+
--ipc=host \
|
| 36 |
+
--network=host \
|
| 37 |
+
--security-opt seccomp=unconfined \
|
| 38 |
+
-v "$PWD:/workspace/cuda_first_repo" \
|
| 39 |
+
vllm/vllm-openai-rocm:latest \
|
| 40 |
+
vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## 5. Capture benchmark metadata
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 47 |
+
python scripts/collect_benchmark_result.py --output benchmark_result.json
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Replace the pending `benchmark_result.json` (written by `scripts/collect_benchmark_result.py` above) with the captured result before final submission.
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/benchmark_result.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verified": false,
|
| 3 |
+
"status": "pending_external_amd_developer_cloud_run",
|
| 4 |
+
"hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
|
| 5 |
+
"rocm_version": "capture with rocminfo or container metadata",
|
| 6 |
+
"vllm_version": "capture with python -m vllm --version",
|
| 7 |
+
"model": "Qwen/Qwen3-Coder-Next-FP8",
|
| 8 |
+
"prompt_config": {
|
| 9 |
+
"input_tokens": 512,
|
| 10 |
+
"output_tokens": 256,
|
| 11 |
+
"concurrency": 8,
|
| 12 |
+
"requests": 64
|
| 13 |
+
},
|
| 14 |
+
"throughput_tokens_per_second": null,
|
| 15 |
+
"p50_latency_ms": null,
|
| 16 |
+
"p95_latency_ms": null,
|
| 17 |
+
"peak_vram_gb": null,
|
| 18 |
+
"log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
|
| 19 |
+
"notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
|
| 20 |
+
}
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/migration_report.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCmPort AI Migration Report: cuda_first_repo
|
| 2 |
+
|
| 3 |
+
## AMD Readiness Score
|
| 4 |
+
|
| 5 |
+
- Before deterministic fixes: 42/100
|
| 6 |
+
- Migration package generated: 67/100
|
| 7 |
+
- This score means ROCm migration artifacts were generated and are ready for AMD Developer Cloud validation; it is not a production certification.
|
| 8 |
+
|
| 9 |
+
| Category | Before | Migration package |
|
| 10 |
+
| --- | ---: | ---: |
|
| 11 |
+
| Code portability | 0 | 46 |
|
| 12 |
+
| Environment readiness | 0 | 0 |
|
| 13 |
+
| Serving readiness | 80 | 96 |
|
| 14 |
+
| Benchmark readiness | 30 | 92 |
|
| 15 |
+
| Deployment readiness | 100 | 100 |
|
| 16 |
+
|
| 17 |
+
## Findings
|
| 18 |
+
|
| 19 |
+
| Severity | Category | Location | Finding | Suggested fix |
|
| 20 |
+
| --- | --- | --- | --- | --- |
|
| 21 |
+
| medium | Environment readiness | `benchmarks/benchmark.py:13` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 22 |
+
| high | Benchmark readiness | `benchmarks/benchmark.py:22` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 23 |
+
| high | Benchmark readiness | `benchmarks/benchmark.py:24` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 24 |
+
| high | Code portability | `benchmarks/benchmark.py:36` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 25 |
+
| high | Code portability | `benchmarks/benchmark.py:38` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 26 |
+
| high | Code portability | `benchmarks/benchmark.py:41` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 27 |
+
| medium | Environment readiness | `docker-compose.yml:6` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 28 |
+
| medium | Environment readiness | `docker-compose.yml:7` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 29 |
+
| medium | Environment readiness | `docker-compose.yml:8` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 30 |
+
| medium | Environment readiness | `docker-compose.yml:24` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 31 |
+
| medium | Environment readiness | `docker-compose.yml:25` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 32 |
+
| high | Environment readiness | `docker-compose.yml:29` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 33 |
+
| low | Serving readiness | `docker-compose.yml:30` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
|
| 34 |
+
| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
|
| 35 |
+
| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 36 |
+
| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 37 |
+
| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 38 |
+
| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 39 |
+
| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 40 |
+
| medium | Environment readiness | `requirements.txt:4` | Dependency references a CUDA-specific package. | Replace CUDA-specific wheels with ROCm-compatible PyTorch or library builds. |
|
| 41 |
+
| medium | Environment readiness | `requirements.txt:5` | Dependency references a CUDA-specific package. | Replace CUDA-specific wheels with ROCm-compatible PyTorch or library builds. |
|
| 42 |
+
| medium | Environment readiness | `scripts/serve_vllm.sh:4` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 43 |
+
| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 44 |
+
| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
|
| 45 |
+
| medium | Environment readiness | `scripts/train.py:13` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 46 |
+
| medium | Environment readiness | `scripts/train.py:14` | CUDA toolkit path environment variable found. | Remove CUDA toolkit path assumptions or replace with ROCm installation paths when required. |
|
| 47 |
+
| high | Code portability | `scripts/train.py:18` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 48 |
+
| low | Code portability | `scripts/train.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 49 |
+
| high | Code portability | `scripts/train.py:30` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 50 |
+
| high | Code portability | `scripts/train.py:35` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 51 |
+
| high | Code portability | `scripts/train.py:36` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 52 |
+
| high | Code portability | `scripts/train.py:44` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 53 |
+
| high | Code portability | `scripts/train.py:45` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 54 |
+
| low | Code portability | `scripts/train.py:59` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 55 |
+
|
| 56 |
+
## Generated Artifacts
|
| 57 |
+
|
| 58 |
+
- `rocm_patch.diff` contains deterministic MVP fixes.
|
| 59 |
+
- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
|
| 60 |
+
- `amd_developer_cloud_runbook.md` documents the validation path.
|
| 61 |
+
- `benchmark_result.json` records the AMD benchmark schema and status.
|
| 62 |
+
|
| 63 |
+
## Qwen Agent Notes
|
| 64 |
+
|
| 65 |
+
Qwen endpoint was not configured. The report uses deterministic scanner output only.
|
| 66 |
+
|
| 67 |
+
## Remaining Risks
|
| 68 |
+
|
| 69 |
+
- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
|
| 70 |
+
- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
|
| 71 |
+
- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocm_patch.diff
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- a/benchmarks/benchmark.py
|
| 2 |
+
+++ b/benchmarks/benchmark.py
|
| 3 |
+
@@ -9,6 +9,9 @@
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 7 |
+
+
|
| 8 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 9 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 10 |
+
|
| 11 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" # should → HIP_VISIBLE_DEVICES
|
| 12 |
+
|
| 13 |
+
@@ -33,12 +36,12 @@
|
| 14 |
+
hw = gpu_info()
|
| 15 |
+
print("GPU info:", hw)
|
| 16 |
+
|
| 17 |
+
- device = torch.device("cuda") # hardcoded CUDA device
|
| 18 |
+
+ device = _rocmport_device # runtime device abstraction (ROCm, CUDA, or CPU)
|
| 19 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 20 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda()
|
| 21 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # device-agnostic module transfer
|
| 22 |
+
model.eval()
|
| 23 |
+
|
| 24 |
+
- inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda") # .to("cuda")
|
| 25 |
+
+ inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device) # device-agnostic tensor transfer
|
| 26 |
+
|
| 27 |
+
# Warm-up
|
| 28 |
+
with torch.no_grad():
|
| 29 |
+
--- a/docker-compose.yml
|
| 30 |
+
+++ b/docker-compose.yml
|
| 31 |
+
@@ -3,9 +3,9 @@
|
| 32 |
+
inference:
|
| 33 |
+
image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 34 |
+
environment:
|
| 35 |
+
- - NVIDIA_VISIBLE_DEVICES=all
|
| 36 |
+
- - NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
| 37 |
+
- - CUDA_VISIBLE_DEVICES=0
|
| 38 |
+
+ # NVIDIA_VISIBLE_DEVICES=all is replaced by the HIP_VISIBLE_DEVICES entry below
|
| 39 |
+
+ # NVIDIA_DRIVER_CAPABILITIES has no ROCm equivalent; AMD GPUs are exposed via /dev/kfd and /dev/dri
|
| 40 |
+
+ - HIP_VISIBLE_DEVICES=0
|
| 41 |
+
deploy:
|
| 42 |
+
resources:
|
| 43 |
+
reservations:
|
| 44 |
+
@@ -21,10 +21,10 @@
|
| 45 |
+
vllm_server:
|
| 46 |
+
image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 47 |
+
environment:
|
| 48 |
+
- - NVIDIA_VISIBLE_DEVICES=0,1
|
| 49 |
+
- - CUDA_VISIBLE_DEVICES=0,1
|
| 50 |
+
+ # NVIDIA_VISIBLE_DEVICES=0,1 is replaced by the HIP_VISIBLE_DEVICES entry below
|
| 51 |
+
+ - HIP_VISIBLE_DEVICES=0,1
|
| 52 |
+
ports:
|
| 53 |
+
- "8000:8000"
|
| 54 |
+
command: >
|
| 55 |
+
- bash -c "nvidia-smi && pip install vllm &&
|
| 56 |
+
+ bash -c "rocm-smi && pip install vllm &&
|
| 57 |
+
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
|
| 58 |
+
--- a/Dockerfile
|
| 59 |
+
+++ b/Dockerfile
|
| 60 |
+
@@ -1,10 +1,10 @@
|
| 61 |
+
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 62 |
+
+FROM vllm/vllm-openai-rocm:latest
|
| 63 |
+
|
| 64 |
+
WORKDIR /app
|
| 65 |
+
COPY requirements.txt .
|
| 66 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 67 |
+
COPY . .
|
| 68 |
+
|
| 69 |
+
-ENV NVIDIA_VISIBLE_DEVICES=all
|
| 70 |
+
+ENV HIP_VISIBLE_DEVICES=all
|
| 71 |
+
|
| 72 |
+
CMD ["python", "infer.py"]
|
| 73 |
+
--- a/infer.py
|
| 74 |
+
+++ b/infer.py
|
| 75 |
+
@@ -1,15 +1,18 @@
|
| 76 |
+
import torch
|
| 77 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 78 |
+
|
| 79 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 80 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 81 |
+
+
|
| 82 |
+
|
| 83 |
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 84 |
+
-device = torch.device("cuda")
|
| 85 |
+
+device = _rocmport_device
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def main():
|
| 89 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 90 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
|
| 91 |
+
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
|
| 92 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
|
| 93 |
+
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
|
| 94 |
+
with torch.no_grad():
|
| 95 |
+
outputs = model.generate(**inputs, max_new_tokens=64)
|
| 96 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 97 |
+
--- a/scripts/serve_vllm.sh
|
| 98 |
+
+++ b/scripts/serve_vllm.sh
|
| 99 |
+
@@ -1,6 +1,6 @@
|
| 100 |
+
#!/usr/bin/env bash
|
| 101 |
+
set -euo pipefail
|
| 102 |
+
|
| 103 |
+
-export CUDA_VISIBLE_DEVICES=0
|
| 104 |
+
-nvidia-smi
|
| 105 |
+
+export HIP_VISIBLE_DEVICES=0
|
| 106 |
+
+rocm-smi
|
| 107 |
+
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
|
| 108 |
+
--- a/scripts/train.py
|
| 109 |
+
+++ b/scripts/train.py
|
| 110 |
+
@@ -9,13 +9,16 @@
|
| 111 |
+
from torch.utils.data import DataLoader, TensorDataset
|
| 112 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 113 |
+
|
| 114 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 115 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 116 |
+
+
|
| 117 |
+
# ── CUDA-specific patterns that ROCmPort will flag ─────────────────────────
|
| 118 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # should → HIP_VISIBLE_DEVICES
|
| 119 |
+
os.environ["CUDA_HOME"] = "/usr/local/cuda" # should be removed / replaced
|
| 120 |
+
|
| 121 |
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 122 |
+
|
| 123 |
+
-device = torch.device("cuda") # hardcoded CUDA device
|
| 124 |
+
+device = _rocmport_device # runtime device abstraction (ROCm, CUDA, or CPU)
|
| 125 |
+
print("CUDA available:", torch.cuda.is_available())
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@@ -27,13 +30,13 @@
|
| 129 |
+
|
| 130 |
+
def train(epochs: int = 3, lr: float = 2e-5):
|
| 131 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 132 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() call
|
| 133 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # device-agnostic module transfer
|
| 134 |
+
|
| 135 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
|
| 136 |
+
|
| 137 |
+
ids, labels = get_dummy_batch()
|
| 138 |
+
- ids = ids.to("cuda") # hardcoded "cuda" string
|
| 139 |
+
- labels = labels.to("cuda") # hardcoded "cuda" string
|
| 140 |
+
+ ids = ids.to(_rocmport_device) # device-agnostic transfer
|
| 141 |
+
+ labels = labels.to(_rocmport_device) # device-agnostic transfer
|
| 142 |
+
|
| 143 |
+
dataset = TensorDataset(ids, labels)
|
| 144 |
+
loader = DataLoader(dataset, batch_size=2)
|
| 145 |
+
@@ -41,8 +44,8 @@
|
| 146 |
+
model.train()
|
| 147 |
+
for epoch in range(epochs):
|
| 148 |
+
for batch_ids, batch_labels in loader:
|
| 149 |
+
- batch_ids = batch_ids.cuda() # another .cuda() call
|
| 150 |
+
- batch_labels = batch_labels.cuda()
|
| 151 |
+
+ batch_ids = batch_ids.to(_rocmport_device) # device-agnostic transfer
|
| 152 |
+
+ batch_labels = batch_labels.to(_rocmport_device)
|
| 153 |
+
outputs = model(input_ids=batch_ids, labels=batch_labels)
|
| 154 |
+
loss = outputs.loss
|
| 155 |
+
loss.backward()
|
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocmport_artifacts.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:232d4aa956fe47770fd6f443c73af4ba5f3bfbc5cc73c997f9fc77538bf7c918
|
| 3 |
+
size 6603
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/Dockerfile.rocm
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM vllm/vllm-openai-rocm:latest
|
| 2 |
+
|
| 3 |
+
WORKDIR /workspace/cuda_first_repo
|
| 4 |
+
COPY . /workspace/cuda_first_repo
|
| 5 |
+
|
| 6 |
+
RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
|
| 7 |
+
|
| 8 |
+
ENV HIP_VISIBLE_DEVICES=0
|
| 9 |
+
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
| 10 |
+
|
| 11 |
+
CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_FEEDBACK.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm / AMD Developer Cloud Feedback
|
| 2 |
+
|
| 3 |
+
## What worked well
|
| 4 |
+
|
| 5 |
+
- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
|
| 6 |
+
- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
|
| 7 |
+
- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
|
| 8 |
+
|
| 9 |
+
## Friction points to document during the live run
|
| 10 |
+
|
| 11 |
+
- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
|
| 12 |
+
- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
|
| 13 |
+
- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
|
| 14 |
+
|
| 15 |
+
## Suggested product improvement
|
| 16 |
+
|
| 17 |
+
Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_MIGRATION_COOKBOOK.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCm Migration Cookbook
|
| 2 |
+
|
| 3 |
+
## PyTorch device handling
|
| 4 |
+
|
| 5 |
+
Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 11 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 12 |
+
model = model.to(device)
|
| 13 |
+
inputs = inputs.to(device)
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## GPU inspection
|
| 17 |
+
|
| 18 |
+
Replace NVIDIA-only commands with ROCm equivalents:
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## Containers
|
| 25 |
+
|
| 26 |
+
For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
docker pull vllm/vllm-openai-rocm:latest
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Run with AMD GPU device access:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Manual review cases
|
| 39 |
+
|
| 40 |
+
Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/amd_developer_cloud_runbook.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AMD Developer Cloud Runbook
|
| 2 |
+
|
| 3 |
+
This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
|
| 4 |
+
|
| 5 |
+
## 1. Create an AMD GPU VM
|
| 6 |
+
|
| 7 |
+
Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
|
| 8 |
+
|
| 9 |
+
## 2. Build the ROCm container
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
## 3. Run a smoke check
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
docker run --rm -it \
|
| 19 |
+
--device /dev/kfd \
|
| 20 |
+
--device /dev/dri \
|
| 21 |
+
--group-add video \
|
| 22 |
+
--ipc=host \
|
| 23 |
+
--network=host \
|
| 24 |
+
--security-opt seccomp=unconfined \
|
| 25 |
+
rocmport-cuda_first_repo
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## 4. Run vLLM on ROCm
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
docker run --rm -it \
|
| 32 |
+
--device /dev/kfd \
|
| 33 |
+
--device /dev/dri \
|
| 34 |
+
--group-add video \
|
| 35 |
+
--ipc=host \
|
| 36 |
+
--network=host \
|
| 37 |
+
--security-opt seccomp=unconfined \
|
| 38 |
+
-v "$PWD:/workspace/cuda_first_repo" \
|
| 39 |
+
vllm/vllm-openai-rocm:latest \
|
| 40 |
+
vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## 5. Capture benchmark metadata
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
rocm-smi --showproductname --showmeminfo vram --showuse
|
| 47 |
+
python scripts/collect_benchmark_result.py --output benchmark_result.json
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Replace the pending `benchmark_result.json` (written by `scripts/collect_benchmark_result.py` above) with the captured result before final submission.
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/benchmark_result.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"verified": false,
|
| 3 |
+
"status": "pending_external_amd_developer_cloud_run",
|
| 4 |
+
"hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
|
| 5 |
+
"rocm_version": "capture with rocminfo or container metadata",
|
| 6 |
+
"vllm_version": "capture with python -m vllm --version",
|
| 7 |
+
"model": "Qwen/Qwen3-Coder-Next-FP8",
|
| 8 |
+
"prompt_config": {
|
| 9 |
+
"input_tokens": 512,
|
| 10 |
+
"output_tokens": 256,
|
| 11 |
+
"concurrency": 8,
|
| 12 |
+
"requests": 64
|
| 13 |
+
},
|
| 14 |
+
"throughput_tokens_per_second": null,
|
| 15 |
+
"p50_latency_ms": null,
|
| 16 |
+
"p95_latency_ms": null,
|
| 17 |
+
"peak_vram_gb": null,
|
| 18 |
+
"log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
|
| 19 |
+
"notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
|
| 20 |
+
}
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/migration_report.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ROCmPort AI Migration Report: cuda_first_repo
|
| 2 |
+
|
| 3 |
+
## AMD Readiness Score
|
| 4 |
+
|
| 5 |
+
- Before deterministic fixes: 53/100
|
| 6 |
+
- After deterministic fixes: 100/100
|
| 7 |
+
|
| 8 |
+
| Category | Before | After |
|
| 9 |
+
| --- | ---: | ---: |
|
| 10 |
+
| Code portability | 0 | 100 |
|
| 11 |
+
| Environment readiness | 8 | 100 |
|
| 12 |
+
| Serving readiness | 90 | 100 |
|
| 13 |
+
| Benchmark readiness | 65 | 100 |
|
| 14 |
+
| Deployment readiness | 100 | 100 |
|
| 15 |
+
|
| 16 |
+
## Findings
|
| 17 |
+
|
| 18 |
+
| Severity | Category | Location | Finding | Suggested fix |
|
| 19 |
+
| --- | --- | --- | --- | --- |
|
| 20 |
+
| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 21 |
+
| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
|
| 22 |
+
| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
|
| 23 |
+
| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
|
| 24 |
+
| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
|
| 25 |
+
| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
|
| 26 |
+
| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
|
| 27 |
+
| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
|
| 28 |
+
| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
|
| 29 |
+
|
| 30 |
+
## Generated Artifacts
|
| 31 |
+
|
| 32 |
+
- `rocm_patch.diff` contains deterministic MVP fixes.
|
| 33 |
+
- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
|
| 34 |
+
- `amd_developer_cloud_runbook.md` documents the validation path.
|
| 35 |
+
- `benchmark_result.json` records the AMD benchmark schema and status.
|
| 36 |
+
|
| 37 |
+
## Qwen Agent Notes
|
| 38 |
+
|
| 39 |
+
Qwen endpoint was not configured. The report uses deterministic scanner output only.
|
| 40 |
+
|
| 41 |
+
## Remaining Risks
|
| 42 |
+
|
| 43 |
+
- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
|
| 44 |
+
- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
|
| 45 |
+
- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocm_patch.diff
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- a/Dockerfile
|
| 2 |
+
+++ b/Dockerfile
|
| 3 |
+
@@ -1,10 +1,10 @@
|
| 4 |
+
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 5 |
+
+FROM vllm/vllm-openai-rocm:latest
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
-ENV NVIDIA_VISIBLE_DEVICES=all
|
| 13 |
+
+ENV HIP_VISIBLE_DEVICES=all
|
| 14 |
+
|
| 15 |
+
CMD ["python", "infer.py"]
|
| 16 |
+
--- a/infer.py
|
| 17 |
+
+++ b/infer.py
|
| 18 |
+
@@ -1,15 +1,18 @@
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
+
|
| 22 |
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
|
| 23 |
+
+_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 24 |
+
+
|
| 25 |
+
|
| 26 |
+
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 27 |
+
-device = torch.device("cuda")
|
| 28 |
+
+device = _rocmport_device
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main():
|
| 32 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 33 |
+
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
|
| 34 |
+
- inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
|
| 35 |
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
|
| 36 |
+
+ inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
|
| 37 |
+
with torch.no_grad():
|
| 38 |
+
outputs = model.generate(**inputs, max_new_tokens=64)
|
| 39 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 40 |
+
--- a/scripts/serve_vllm.sh
|
| 41 |
+
+++ b/scripts/serve_vllm.sh
|
| 42 |
+
@@ -2,5 +2,5 @@
|
| 43 |
+
set -euo pipefail
|
| 44 |
+
|
| 45 |
+
export CUDA_VISIBLE_DEVICES=0
|
| 46 |
+
-nvidia-smi
|
| 47 |
+
+rocm-smi
|
| 48 |
+
vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
|
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocmport_artifacts.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd2494bdf724b6ec9d98675fea27408938c2ee0f7ee6b2bd219e10a857a1c105
|
| 3 |
+
size 5229
|
artifacts/runtime/rocmport-ui-artifacts-6f4540ff23e142ad9d6ab18154ea44e6/Dockerfile.rocm
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM vllm/vllm-openai-rocm:latest
|
| 2 |
+
|
| 3 |
+
WORKDIR /workspace/cuda_first_repo
|
| 4 |
+
COPY . /workspace/cuda_first_repo
|
| 5 |
+
|
| 6 |
+
RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
|
| 7 |
+
|
| 8 |
+
ENV HIP_VISIBLE_DEVICES=0
|
| 9 |
+
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
| 10 |
+
|
| 11 |
+
CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
|