Nawangdorjay committed
Commit 786f63c · verified · 1 Parent(s): f41f9ce

Deploy ROCmPort AI — CUDA-to-ROCm migration scanner

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +2 -0
  2. .gitignore +12 -0
  3. .venv/Scripts/python.exe +3 -0
  4. .venv/Scripts/pythonw.exe +3 -0
  5. .venv/pyvenv.cfg +5 -0
  6. LICENSE +21 -0
  7. README.md +110 -6
  8. app.py +370 -0
  9. artifacts/check-scoring/Dockerfile.rocm +11 -0
  10. artifacts/check-scoring/ROCM_FEEDBACK.md +17 -0
  11. artifacts/check-scoring/ROCM_MIGRATION_COOKBOOK.md +40 -0
  12. artifacts/check-scoring/amd_developer_cloud_runbook.md +50 -0
  13. artifacts/check-scoring/benchmark_result.json +20 -0
  14. artifacts/check-scoring/migration_report.md +47 -0
  15. artifacts/check-scoring/rocm_patch.diff +50 -0
  16. artifacts/check-scoring/rocmport_artifacts.zip +3 -0
  17. artifacts/check/Dockerfile.rocm +11 -0
  18. artifacts/check/ROCM_FEEDBACK.md +17 -0
  19. artifacts/check/ROCM_MIGRATION_COOKBOOK.md +40 -0
  20. artifacts/check/amd_developer_cloud_runbook.md +50 -0
  21. artifacts/check/benchmark_result.json +20 -0
  22. artifacts/check/migration_report.md +45 -0
  23. artifacts/check/rocm_patch.diff +48 -0
  24. artifacts/check/rocmport_artifacts.zip +3 -0
  25. artifacts/check2/Dockerfile.rocm +11 -0
  26. artifacts/check2/ROCM_FEEDBACK.md +17 -0
  27. artifacts/check2/ROCM_MIGRATION_COOKBOOK.md +40 -0
  28. artifacts/check2/amd_developer_cloud_runbook.md +50 -0
  29. artifacts/check2/benchmark_result.json +20 -0
  30. artifacts/check2/migration_report.md +46 -0
  31. artifacts/check2/rocm_patch.diff +50 -0
  32. artifacts/check2/rocmport_artifacts.zip +3 -0
  33. artifacts/hackathon_content.txt +0 -0
  34. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/Dockerfile.rocm +11 -0
  35. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_FEEDBACK.md +17 -0
  36. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_MIGRATION_COOKBOOK.md +40 -0
  37. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/amd_developer_cloud_runbook.md +50 -0
  38. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/benchmark_result.json +20 -0
  39. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/migration_report.md +71 -0
  40. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocm_patch.diff +155 -0
  41. artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocmport_artifacts.zip +3 -0
  42. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/Dockerfile.rocm +11 -0
  43. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_FEEDBACK.md +17 -0
  44. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_MIGRATION_COOKBOOK.md +40 -0
  45. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/amd_developer_cloud_runbook.md +50 -0
  46. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/benchmark_result.json +20 -0
  47. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/migration_report.md +45 -0
  48. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocm_patch.diff +48 -0
  49. artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocmport_artifacts.zip +3 -0
  50. artifacts/runtime/rocmport-ui-artifacts-6f4540ff23e142ad9d6ab18154ea44e6/Dockerfile.rocm +11 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
+.venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
+__pycache__/
+.pytest_cache/
+.venv/
+venv/
+dist/
+build/
+*.egg-info/
+.gradio/
+artifacts/
+tmp/
+.tmp/
+pytest-cache-files-*/
.venv/Scripts/python.exe ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8afe555632efdbf8b01309532efc9389c5d4417fac976f6ee4389c7750772745
+size 269072
.venv/Scripts/pythonw.exe ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:080e0247dd5a0240070bb2244b2bd64fc31217c764c8c9d7ce3bf844760ba88e
+size 256784
.venv/pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
+home = C:\Users\nawan\anaconda3
+include-system-site-packages = false
+version = 3.12.4
+executable = C:\Users\nawan\anaconda3\python.exe
+command = C:\Users\nawan\anaconda3\python.exe -m venv C:\Users\nawan\Documents\Codex\2026-05-05\come-build-the-next-generation-of\.venv
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 ROCmPort AI contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md CHANGED
@@ -1,11 +1,115 @@
 ---
 title: ROCmPort AI
-emoji: 👁
-colorFrom: purple
-colorTo: purple
-sdk: static
+emoji:
+colorFrom: red
+colorTo: yellow
+sdk: gradio
+app_file: app.py
 pinned: false
-short_description: 'CUDA to ROCm/HIP migration assistant using AI agents on AMD '
+license: mit
+short_description: CUDA-to-ROCm migration scanner for PyTorch, HF & vLLM repos
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# ROCmPort AI
+
+> **AMD Developer Hackathon — lablab.ai** | Track: AI Agents & Agentic Workflows
+
+ROCmPort AI is a **CUDA-to-ROCm migration scanner** powered by a three-agent CrewAI pipeline and Qwen3-Coder running on AMD Instinct GPUs. Drop in any CUDA-first PyTorch, Hugging Face, or vLLM repository and get a full AMD readiness report in seconds.
+
+## What it does
+
+```mermaid
+graph LR
+    User([User Repo]) --> Gradio[Gradio UI]
+    Gradio --> Pipeline{Pipeline}
+
+    subgraph Agentic Workflow
+        Pipeline --> Auditor[CUDA Auditor]
+        Auditor --> Engineer[ROCm Engineer]
+        Engineer --> Reporter[Report Writer]
+    end
+
+    Reporter --> LLM[(Qwen3-Coder on AMD Instinct)]
+    LLM --> Reporter
+
+    Pipeline --> Scanner[Deterministic Scanner]
+    Scanner --> Patcher[Patcher]
+    Patcher --> Artifacts[Artifact Generator]
+
+    Reporter --> Final([Migration Artifacts & Patch])
+    Artifacts --> Final
+```
+
+| Output | Description |
+|---|---|
+| **AMD Readiness Score** | Before/after scores across 5 categories |
+| **Findings table** | File + line references for every CUDA blocker |
+| **ROCm patch diff** | Auto-generated unified diff to apply deterministic fixes |
+| **Dockerfile.rocm** | ROCm-enabled container using vllm/vllm-openai-rocm |
+| **AMD Developer Cloud Runbook** | Exact validation commands for AMD Instinct GPUs |
+| **Migration report** | Narrative report (CrewAI + Qwen when configured) |
+| **Benchmark schema** | Structured result to fill after AMD Developer Cloud run |
+| **Artifact ZIP** | All outputs bundled for download |
+
+## Three-agent pipeline
+
+When `QWEN_BASE_URL` and `QWEN_API_KEY` are set (pointing to a Qwen3-Coder endpoint on AMD Instinct MI300X via vLLM), three CrewAI agents collaborate:
+
+1. **CUDA Migration Auditor** — scans every file for blockers using the `scan_cuda_repository` tool
+2. **ROCm Migration Engineer** — generates the patch diff using the `generate_rocm_patch` tool
+3. **Migration Report Writer** — synthesises findings into an actionable Markdown report
+
+Without those env vars, the app falls back to the fully deterministic scanner + patcher (which always runs).
+
+## Run locally
+
+```bash
+pip install -r requirements.txt
+python app.py
+```
+
+The app listens on `http://127.0.0.1:7860`.
+
+## Enable the full CrewAI + Qwen pipeline
+
+```bash
+# Windows
+set QWEN_BASE_URL=https://your-amd-instinct-endpoint/v1
+set QWEN_API_KEY=your-token
+set QWEN_MODEL=Qwen/Qwen3-Coder-Next-FP8
+python app.py
+
+# Linux / macOS
+QWEN_BASE_URL=https://your-amd-instinct-endpoint/v1 \
+QWEN_API_KEY=your-token \
+QWEN_MODEL=Qwen/Qwen3-Coder-Next-FP8 \
+python app.py
+```
+
+## Tests
+
+```bash
+python -m pytest tests/ -v
+```
+
+Seven tests cover the scanner, pipeline, and CrewAI agent layer.
+
+## AMD Benchmark
+
+`data/benchmark_result.json` is a transparent **pending benchmark schema**, not a fabricated result. Run the generated AMD Developer Cloud runbook (shown in the app's Runbook tab) on an AMD Instinct MI300X instance to capture real throughput, latency, and VRAM figures, then replace the file.
+
+## Deploy to Hugging Face Spaces
+
+```bash
+python scripts/deploy_to_hf.py --token hf_... --username YourHFUsername
+```
+
+## Tech stack
+
+- **AMD Developer Cloud** + **AMD Instinct MI300X** for GPU compute
+- **ROCm** — open-source GPU computing platform
+- **CrewAI** — multi-agent orchestration
+- **Qwen3-Coder-Next-FP8** — code-specialist LLM on AMD hardware
+- **vLLM (ROCm build)** — high-throughput serving
+- **Hugging Face** — model hub + Space hosting
+- **Gradio 5** — web UI
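The three-agent pipeline the README describes lives in `rocmport/agents.py`, which falls outside this 50-file view. A minimal sketch of how such a crew could be wired with the two tools the README names — every name, signature, and wiring detail below is an illustrative assumption, not the committed implementation:

```python
# Hypothetical wiring for the three-agent pipeline described in the README.
# rocmport/agents.py is not shown in this view; the tool bodies, model
# routing, and task prompts here are assumptions for illustration only.
import os

from crewai import Agent, Crew, LLM, Task
from crewai.tools import tool


@tool("scan_cuda_repository")
def scan_cuda_repository(repo_path: str) -> str:
    """Stub: the real tool returns structured CUDA-blocker findings."""
    return f"findings for {repo_path}"


@tool("generate_rocm_patch")
def generate_rocm_patch(repo_path: str) -> str:
    """Stub: the real tool returns a unified diff of deterministic fixes."""
    return f"--- patch for {repo_path}"


# CrewAI routes through LiteLLM, so an OpenAI-compatible vLLM endpoint is
# typically addressed with an "openai/" model prefix.
llm = LLM(
    model="openai/" + os.getenv("QWEN_MODEL", "Qwen/Qwen3-Coder-Next-FP8"),
    base_url=os.environ["QWEN_BASE_URL"],
    api_key=os.environ["QWEN_API_KEY"],
)

auditor = Agent(role="CUDA Migration Auditor", goal="Find every CUDA blocker in the repo",
                backstory="GPU portability reviewer", tools=[scan_cuda_repository], llm=llm)
engineer = Agent(role="ROCm Migration Engineer", goal="Produce the ROCm patch diff",
                 backstory="ROCm porting specialist", tools=[generate_rocm_patch], llm=llm)
writer = Agent(role="Migration Report Writer", goal="Write an actionable Markdown report",
               backstory="Technical writer for GPU migrations", llm=llm)

crew = Crew(agents=[auditor, engineer, writer], tasks=[
    Task(description="Audit the repository for CUDA blockers.",
         expected_output="A findings table with file and line references.", agent=auditor),
    Task(description="Generate the ROCm migration patch.",
         expected_output="A unified diff.", agent=engineer),
    Task(description="Synthesise the findings and patch into a migration report.",
         expected_output="A Markdown report.", agent=writer),
])

if __name__ == "__main__":
    print(crew.kickoff())
```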
app.py ADDED
@@ -0,0 +1,370 @@
+from __future__ import annotations
+
+import json
+import os
+import uuid
+from pathlib import Path
+from typing import Any
+
+import gradio as gr
+
+from rocmport.agents import CREWAI_AVAILABLE
+from rocmport.ingest import PreparedRepo, prepare_github_repo, prepare_uploaded_zip, sample_repo_path
+from rocmport.models import CATEGORY_LABELS, MigrationBundle
+from rocmport.pipeline import analyze_repository
+
+
+def _pipeline_mode_html() -> str:
+    """Return an HTML badge indicating whether the agentic CrewAI pipeline is active."""
+    if (
+        CREWAI_AVAILABLE
+        and os.getenv("QWEN_BASE_URL", "").strip()
+        and os.getenv("QWEN_API_KEY", "").strip()
+    ):
+        return (
+            "<div class='mode-badge agentic'>"
+            "🤖 <strong>CrewAI Agentic Mode</strong> &mdash; "
+            "CUDA Auditor &rarr; ROCm Engineer &rarr; Report Writer agents active "
+            "(powered by Qwen3-Coder on AMD Instinct)"
+            "</div>"
+        )
+    return (
+        "<div class='mode-badge deterministic'>"
+        "⚙️ <strong>Deterministic Mode</strong> &mdash; "
+        "Set <code>QWEN_BASE_URL</code> &amp; <code>QWEN_API_KEY</code> "
+        "to enable the full CrewAI multi-agent pipeline."
+        "</div>"
+    )
+
+
+PROJECT_ROOT = Path(__file__).resolve().parent
+RUNTIME_DIR = PROJECT_ROOT / "artifacts" / "runtime"
+RUNTIME_DIR.mkdir(parents=True, exist_ok=True)
+os.environ.setdefault("ROCMPORT_TMP_DIR", str(RUNTIME_DIR))
+
+
+def run_analysis(source_mode: str, uploaded_zip: str | None, github_url: str, branch: str) -> tuple[Any, ...]:
+    try:
+        prepared = _prepare_repo(source_mode, uploaded_zip, github_url, branch)
+        output_dir = RUNTIME_DIR / f"rocmport-ui-artifacts-{uuid.uuid4().hex}"
+        output_dir.mkdir(parents=True, exist_ok=False)
+        bundle = analyze_repository(prepared.path, output_dir=output_dir, repo_name=prepared.name)
+        return _format_outputs(bundle)
+    except Exception as exc:
+        error = f"Analysis failed: {exc}"
+        empty_scores = "<div class='score-card'><h2>Analysis failed</h2><p>{}</p></div>".format(error)
+        return (
+            empty_scores,
+            [],
+            error,
+            "",
+            "",
+            "",
+            error,
+            "{}",
+            error,
+            None,
+        )
+
+
+def _prepare_repo(source_mode: str, uploaded_zip: str | None, github_url: str, branch: str) -> PreparedRepo:
+    if source_mode == "Built-in sample":
+        sample = sample_repo_path(PROJECT_ROOT)
+        return PreparedRepo(path=sample, name="cuda_first_repo")
+    if source_mode == "Uploaded ZIP":
+        if not uploaded_zip:
+            raise ValueError("Upload a ZIP file or switch to the built-in sample.")
+        return prepare_uploaded_zip(uploaded_zip)
+    if source_mode == "Public GitHub URL":
+        if not github_url.strip():
+            raise ValueError("Enter a public GitHub repository URL.")
+        return prepare_github_repo(github_url, branch.strip() or "main")
+    raise ValueError("Unknown source mode.")
+
+
+def _format_outputs(bundle: MigrationBundle) -> tuple[Any, ...]:
+    benchmark_json = json.dumps(bundle.benchmark, indent=2)
+    return (
+        _score_html(bundle),
+        bundle.findings_table(),
+        _migration_plan_markdown(bundle),
+        bundle.patch_diff,
+        bundle.dockerfile,
+        bundle.runbook,
+        _benchmark_markdown(bundle.benchmark),
+        benchmark_json,
+        bundle.report,
+        bundle.artifact_paths.get("rocmport_artifacts.zip"),
+    )
+
+
+def _score_html(bundle: MigrationBundle) -> str:
+    rows = []
+    for category, label in CATEGORY_LABELS.items():
+        before = bundle.before_score.categories[category]
+        after = bundle.after_score.categories[category]
+        rows.append(
+            f"""
+            <tr>
+              <td>{label}</td>
+              <td><div class="meter"><span style="width:{before}%"></span></div><strong>{before}</strong></td>
+              <td><div class="meter after"><span style="width:{after}%"></span></div><strong>{after}</strong></td>
+            </tr>
+            """
+        )
+    return f"""
+    <div class="score-wrap">
+      <div class="score-card">
+        <div class="score-label">Before</div>
+        <div class="score-number">{bundle.before_score.total}</div>
+      </div>
+      <div class="score-card">
+        <div class="score-label">Migration package</div>
+        <div class="score-number after-text">{bundle.after_score.total}</div>
+      </div>
+      <div class="score-card">
+        <div class="score-label">Findings</div>
+        <div class="score-number">{len(bundle.findings)}</div>
+      </div>
+    </div>
+    <table class="score-table">
+      <thead><tr><th>Category</th><th>Before</th><th>Migration package</th></tr></thead>
+      <tbody>{''.join(rows)}</tbody>
+    </table>
+    """
+
+
+def _migration_plan_markdown(bundle: MigrationBundle) -> str:
+    if not bundle.findings:
+        return "### Migration Plan\n\nNo blockers were found. Run the generated AMD Developer Cloud smoke test before submission."
+    grouped: dict[str, list[str]] = {}
+    for finding in bundle.findings:
+        grouped.setdefault(finding.category, []).append(
+            f"- `{finding.path}:{finding.line}`: {finding.suggested_fix}"
+        )
+    sections = ["### Migration Plan"]
+    for category, label in CATEGORY_LABELS.items():
+        if category not in grouped:
+            continue
+        sections.append(f"\n#### {label}\n" + "\n".join(grouped[category][:8]))
+    return "\n".join(sections)
+
+
+def _benchmark_markdown(benchmark: dict[str, Any]) -> str:
+    verified = benchmark.get("verified", False)
+    status = "Verified AMD Developer Cloud run" if verified else "Pending AMD Developer Cloud run"
+    lines = [
+        f"### {status}",
+        "",
+        f"- Hardware: `{benchmark.get('hardware', 'not captured')}`",
+        f"- ROCm: `{benchmark.get('rocm_version', 'not captured')}`",
+        f"- vLLM: `{benchmark.get('vllm_version', 'not captured')}`",
+        f"- Model: `{benchmark.get('model', 'not captured')}`",
+        f"- Throughput tokens/sec: `{benchmark.get('throughput_tokens_per_second', 'not captured')}`",
+        f"- P50 latency ms: `{benchmark.get('p50_latency_ms', 'not captured')}`",
+        f"- Peak VRAM GB: `{benchmark.get('peak_vram_gb', 'not captured')}`",
+        "",
+        benchmark.get("notes", "Run the generated AMD Developer Cloud runbook to replace this record with measured values."),
+    ]
+    return "\n".join(lines)
+
+
+CSS = """
+.gradio-container { max-width: 1280px !important; }
+.mode-badge {
+    padding: 10px 16px;
+    border-radius: 8px;
+    font-size: 14px;
+    margin-bottom: 16px;
+    border: 1px solid;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.05);
+}
+.mode-badge.agentic {
+    background: rgba(8, 127, 91, 0.1);
+    border-color: #087f5b;
+    color: var(--body-text-color);
+}
+.mode-badge.deterministic {
+    background: rgba(54, 79, 199, 0.1);
+    border-color: #748ffc;
+    color: var(--body-text-color);
+}
+.score-wrap {
+    display: grid;
+    grid-template-columns: repeat(3, minmax(0, 1fr));
+    gap: 16px;
+    margin: 12px 0 24px;
+}
+.score-card {
+    border: 1px solid var(--border-color-primary);
+    border-radius: 12px;
+    padding: 20px;
+    background: var(--background-fill-secondary);
+    box-shadow: 0 4px 20px rgba(0,0,0,0.08);
+    transition: transform 0.2s ease, box-shadow 0.2s ease;
+}
+.score-card:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 8px 24px rgba(237, 28, 36, 0.15);
+}
+.score-label {
+    color: var(--body-text-color-subdued);
+    font-size: 14px;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+    margin-bottom: 8px;
+    font-weight: 600;
+}
+.score-number {
+    color: var(--body-text-color);
+    font-size: 42px;
+    line-height: 1;
+    font-weight: 800;
+}
+.after-text { color: #ed1c24; } /* AMD red */
+.score-table {
+    width: 100%;
+    border-collapse: collapse;
+    margin-top: 12px;
+}
+.score-table th {
+    text-transform: uppercase;
+    font-size: 12px;
+    color: var(--body-text-color-subdued);
+}
+.score-table th,
+.score-table td {
+    border-bottom: 1px solid var(--border-color-primary);
+    padding: 12px 8px;
+    text-align: left;
+}
+.meter {
+    width: calc(100% - 48px);
+    height: 10px;
+    background: var(--background-fill-primary);
+    border-radius: 5px;
+    display: inline-block;
+    vertical-align: middle;
+    margin-right: 8px;
+    overflow: hidden;
+}
+.meter span {
+    display: block;
+    height: 100%;
+    background: var(--body-text-color-subdued);
+    border-radius: 5px;
+    transition: width 1s cubic-bezier(0.4, 0, 0.2, 1);
+}
+.meter.after span { background: linear-gradient(90deg, #b80000 0%, #ed1c24 100%); }
+#findings-table table {
+    table-layout: fixed;
+}
+#findings-table th {
+    white-space: nowrap;
+}
+"""
+
+THEME = gr.themes.Soft(
+    primary_hue="red",
+    neutral_hue="zinc",
+    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+).set(
+    button_primary_background_fill="linear-gradient(90deg, #ed1c24 0%, #b80000 100%)",
+    button_primary_background_fill_hover="linear-gradient(90deg, #ff333a 0%, #cc0000 100%)",
+    button_primary_text_color="white",
+    block_title_text_weight="600",
+    block_shadow="*shadow_drop_lg",
+    block_border_width="0px",
+    block_radius="*radius_lg",
+)
+
+
+# theme and css belong on gr.Blocks; Blocks.launch() does not accept them.
+with gr.Blocks(title="ROCmPort AI ⚡", theme=THEME, css=CSS) as demo:
+    gr.Markdown("# ROCmPort AI ⚡")
+    gr.Markdown("CUDA-to-ROCm migration scanner for PyTorch, Hugging Face, and vLLM repositories.")
+    gr.HTML(_pipeline_mode_html())
+    gr.Markdown(
+        "> **How it works:** Three CrewAI agents collaborate to migrate the repository. All scoring is deterministic.\n\n"
+        "```text\n"
+        "           [ 📁 User Repository ]\n"
+        "                     │\n"
+        "                     ▼\n"
+        "              [ 🖥️ Gradio UI ]\n"
+        "                     │\n"
+        "                     ▼\n"
+        "               [ ⚙️ Pipeline ]\n"
+        "                     │\n"
+        "   ┌─────────────────┴─────────────────┐\n"
+        " (Agentic Workflow)       (Deterministic Fallback)\n"
+        "   │                                   │\n"
+        " [ 🕵️ CUDA Auditor ]            [ 🔍 Scanner ]\n"
+        "   │                                   │\n"
+        " [ 🛠️ ROCm Engineer ]           [ 🩹 Patcher ]\n"
+        "   │                                   │\n"
+        " [ 📝 Report Writer ]          [ 📦 Artifacts ]\n"
+        "   │                                   │\n"
+        " (🧠 Qwen3 on MI300X)                  │\n"
+        "   │                                   │\n"
+        "   └─────────────────┬─────────────────┘\n"
+        "                     ▼\n"
+        "       [ 🎯 Final Migration Package ]\n"
+        "```"
+    )
+
+    with gr.Row():
+        source_mode = gr.Radio(
+            choices=["Built-in sample", "Uploaded ZIP", "Public GitHub URL"],
+            value="Built-in sample",
+            label="Repository source",
+        )
+        uploaded_zip = gr.File(label="Repository ZIP", type="filepath", file_types=[".zip"])
+    with gr.Row():
+        github_url = gr.Textbox(label="GitHub URL", placeholder="https://github.com/owner/repo")
+        branch = gr.Textbox(label="Branch", value="main")
+
+    analyze_button = gr.Button("Analyze repository", variant="primary")
+
+    with gr.Tabs():
+        with gr.Tab("Scan"):
+            score_html = gr.HTML(label="AMD Readiness Score")
+            findings_table = gr.Dataframe(
+                headers=["Severity", "Category", "Path", "Line", "Finding", "Suggested fix"],
+                label="Findings",
+                wrap=True,
+                column_widths=[92, 210, 260, 72, 500, 620],
+                elem_id="findings-table",
+            )
+            migration_plan = gr.Markdown(label="Migration Plan")
+        with gr.Tab("Patch"):
+            patch_diff = gr.Code(label="rocm_patch.diff", language=None, lines=20)
+            dockerfile = gr.Code(label="Dockerfile.rocm", language="dockerfile", lines=18)
+            runbook = gr.Markdown(label="AMD Developer Cloud Runbook")
+        with gr.Tab("Benchmark"):
+            benchmark_md = gr.Markdown(label="Benchmark Summary")
+            benchmark_json = gr.Code(label="benchmark_result.json", language="json", lines=18)
+        with gr.Tab("Report"):
+            report_md = gr.Markdown(label="Migration Report")
+            artifact_zip = gr.File(label="Download migration artifact bundle")
+
+    analyze_button.click(
+        fn=run_analysis,
+        inputs=[source_mode, uploaded_zip, github_url, branch],
+        outputs=[
+            score_html,
+            findings_table,
+            migration_plan,
+            patch_diff,
+            dockerfile,
+            runbook,
+            benchmark_md,
+            benchmark_json,
+            report_md,
+            artifact_zip,
+        ],
+    )
+
+
+if __name__ == "__main__":
+    server_name = os.getenv("GRADIO_SERVER_NAME") or ("0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1")
+    server_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
+    demo.launch(server_name=server_name, server_port=server_port, quiet=True)
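For orientation, here is how the pipeline entry point that app.py imports might be driven headlessly. The `rocmport` package itself is outside this 50-file view, so the call shape below is inferred from how `run_analysis` and `_format_outputs` use it; treat it as a sketch, not the package's documented API:

```python
# Hypothetical headless use of the pipeline app.py wraps; the MigrationBundle
# fields mirror how app.py consumes them and are otherwise assumed.
from pathlib import Path

from rocmport.pipeline import analyze_repository

bundle = analyze_repository(
    Path("./my_cuda_repo"),       # repository checkout to scan
    output_dir=Path("./out"),     # where patch, Dockerfile.rocm, and the zip land
    repo_name="my_cuda_repo",
)
print(f"readiness: {bundle.before_score.total} -> {bundle.after_score.total}")
print(bundle.patch_diff)          # unified diff with the deterministic fixes
```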
artifacts/check-scoring/Dockerfile.rocm ADDED
@@ -0,0 +1,11 @@
+FROM vllm/vllm-openai-rocm:latest
+
+WORKDIR /workspace/cuda_first_repo
+COPY . /workspace/cuda_first_repo
+
+RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
+
+ENV HIP_VISIBLE_DEVICES=0
+ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
+
+CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
artifacts/check-scoring/ROCM_FEEDBACK.md ADDED
@@ -0,0 +1,17 @@
+# ROCm / AMD Developer Cloud Feedback
+
+## What worked well
+
+- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
+- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
+- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
+
+## Friction points to document during the live run
+
+- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
+- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
+- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
+
+## Suggested product improvement
+
+Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
artifacts/check-scoring/ROCM_MIGRATION_COOKBOOK.md ADDED
@@ -0,0 +1,40 @@
+# ROCm Migration Cookbook
+
+## PyTorch device handling
+
+Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
+
+```python
+import torch
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+inputs = inputs.to(device)
+```
+
+## GPU inspection
+
+Replace NVIDIA-only commands with ROCm equivalents:
+
+```bash
+rocm-smi --showproductname --showmeminfo vram --showuse
+```
+
+## Containers
+
+For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
+
+```bash
+docker pull vllm/vllm-openai-rocm:latest
+```
+
+Run with AMD GPU device access:
+
+```bash
+docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
+```
+
+## Manual review cases
+
+Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
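One starting point for the manual cases the cookbook flags: ROCm ships the `hipify-perl` and `hipify-clang` translators, which mechanically convert CUDA API calls in C++ sources to HIP. This is a general ROCm facility, not something this artifact bundle invokes; the filenames below are placeholders:

```bash
# First-pass mechanical translation of a CUDA kernel source to HIP;
# the output still needs manual review before it is build-ready.
hipify-perl my_kernel.cu > my_kernel.hip.cpp
```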
artifacts/check-scoring/amd_developer_cloud_runbook.md ADDED
@@ -0,0 +1,50 @@
+# AMD Developer Cloud Runbook
+
+This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
+
+## 1. Create an AMD GPU VM
+
+Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
+
+## 2. Build the ROCm container
+
+```bash
+docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
+```
+
+## 3. Run a smoke check
+
+```bash
+docker run --rm -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --group-add video \
+  --ipc=host \
+  --network=host \
+  --security-opt seccomp=unconfined \
+  rocmport-cuda_first_repo
+```
+
+## 4. Run vLLM on ROCm
+
+```bash
+docker run --rm -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --group-add video \
+  --ipc=host \
+  --network=host \
+  --security-opt seccomp=unconfined \
+  -v "$PWD:/workspace/cuda_first_repo" \
+  vllm/vllm-openai-rocm:latest \
+  vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
+```
+
+## 5. Capture benchmark metadata
+
+```bash
+rocm-smi --showproductname --showmeminfo vram --showuse
+python scripts/collect_benchmark_result.py --output benchmark_result.json
+```
+
+Replace `data/benchmark_result.json` with the captured result before final submission.
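The runbook's final step calls `scripts/collect_benchmark_result.py`, which is not among the 50 files shown here. A minimal sketch of what such a collector might do, using only the capture commands the schema itself names (`rocm-smi`, `python -m vllm --version`); the flags, fields, and status string are assumptions:

```python
# Hypothetical sketch of scripts/collect_benchmark_result.py (not shown in
# this commit view). It fills the metadata fields of benchmark_result.json;
# throughput, latency, and VRAM figures still come from the actual load test.
import argparse
import json
import subprocess


def capture(cmd: list[str]) -> str:
    """Run a command and return stdout, or a 'not captured' marker on failure."""
    try:
        out = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return out.stdout.strip()
    except (OSError, subprocess.CalledProcessError):
        return "not captured"


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default="benchmark_result.json")
    args = parser.parse_args()

    record = {
        "verified": True,
        "status": "measured_on_amd_developer_cloud",
        "hardware": capture(["rocm-smi", "--showproductname"]),
        "rocm_version": capture(["cat", "/opt/rocm/.info/version"]),
        "vllm_version": capture(["python", "-m", "vllm", "--version"]),
        "model": "Qwen/Qwen3-Coder-Next-FP8",
    }
    with open(args.output, "w", encoding="utf-8") as fh:
        json.dump(record, fh, indent=2)
    print(f"wrote {args.output}")


if __name__ == "__main__":
    main()
```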
artifacts/check-scoring/benchmark_result.json ADDED
@@ -0,0 +1,20 @@
+{
+  "verified": false,
+  "status": "pending_external_amd_developer_cloud_run",
+  "hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
+  "rocm_version": "capture with rocminfo or container metadata",
+  "vllm_version": "capture with python -m vllm --version",
+  "model": "Qwen/Qwen3-Coder-Next-FP8",
+  "prompt_config": {
+    "input_tokens": 512,
+    "output_tokens": 256,
+    "concurrency": 8,
+    "requests": 64
+  },
+  "throughput_tokens_per_second": null,
+  "p50_latency_ms": null,
+  "p95_latency_ms": null,
+  "peak_vram_gb": null,
+  "log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
+  "notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
+}
artifacts/check-scoring/migration_report.md ADDED
@@ -0,0 +1,47 @@
+# ROCmPort AI Migration Report: cuda_first_repo
+
+## AMD Readiness Score
+
+- Before deterministic fixes: 51/100
+- Migration package generated: 90/100
+- This score means ROCm migration artifacts were generated and are ready for AMD Developer Cloud validation; it is not a production certification.
+
+| Category | Before | Migration package |
+| --- | ---: | ---: |
+| Code portability | 0 | 86 |
+| Environment readiness | 0 | 86 |
+| Serving readiness | 90 | 98 |
+| Benchmark readiness | 65 | 85 |
+| Deployment readiness | 100 | 95 |
+
+## Findings
+
+| Severity | Category | Location | Finding | Suggested fix |
+| --- | --- | --- | --- | --- |
+| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
+| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+| medium | Environment readiness | `scripts/serve_vllm.sh:4` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
+
+## Generated Artifacts
+
+- `rocm_patch.diff` contains deterministic MVP fixes.
+- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
+- `amd_developer_cloud_runbook.md` documents the validation path.
+- `benchmark_result.json` records the AMD benchmark schema and status.
+
+## Qwen Agent Notes
+
+Qwen endpoint was not configured. The report uses deterministic scanner output only.
+
+## Remaining Risks
+
+- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
+- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
+- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
artifacts/check-scoring/rocm_patch.diff ADDED
@@ -0,0 +1,50 @@
+--- a/Dockerfile
++++ b/Dockerfile
+@@ -1,10 +1,10 @@
+-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
++FROM vllm/vllm-openai-rocm:latest
+ 
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ COPY . .
+ 
+-ENV NVIDIA_VISIBLE_DEVICES=all
++ENV HIP_VISIBLE_DEVICES=all
+ 
+ CMD ["python", "infer.py"]
+--- a/infer.py
++++ b/infer.py
+@@ -1,15 +1,18 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
++# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
++_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
++
+ 
+ MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+-device = torch.device("cuda")
++device = _rocmport_device
+ 
+ 
+ def main():
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
+-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
++    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
++    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
+     with torch.no_grad():
+         outputs = model.generate(**inputs, max_new_tokens=64)
+     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+--- a/scripts/serve_vllm.sh
++++ b/scripts/serve_vllm.sh
+@@ -1,6 +1,6 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ 
+-export CUDA_VISIBLE_DEVICES=0
+-nvidia-smi
++export HIP_VISIBLE_DEVICES=0
++rocm-smi
+ vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
artifacts/check-scoring/rocmport_artifacts.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f46889d5d26e62603bf801b51a98a18d203ef648bf52009a0be2777c57dc359a
+size 5349
artifacts/check/Dockerfile.rocm ADDED
@@ -0,0 +1,11 @@
+FROM vllm/vllm-openai-rocm:latest
+
+WORKDIR /workspace/cuda_first_repo
+COPY . /workspace/cuda_first_repo
+
+RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
+
+ENV HIP_VISIBLE_DEVICES=0
+ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
+
+CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
artifacts/check/ROCM_FEEDBACK.md ADDED
@@ -0,0 +1,17 @@
+# ROCm / AMD Developer Cloud Feedback
+
+## What worked well
+
+- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
+- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
+- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
+
+## Friction points to document during the live run
+
+- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
+- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
+- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
+
+## Suggested product improvement
+
+Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
artifacts/check/ROCM_MIGRATION_COOKBOOK.md ADDED
@@ -0,0 +1,40 @@
+# ROCm Migration Cookbook
+
+## PyTorch device handling
+
+Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
+
+```python
+import torch
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+inputs = inputs.to(device)
+```
+
+## GPU inspection
+
+Replace NVIDIA-only commands with ROCm equivalents:
+
+```bash
+rocm-smi --showproductname --showmeminfo vram --showuse
+```
+
+## Containers
+
+For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
+
+```bash
+docker pull vllm/vllm-openai-rocm:latest
+```
+
+Run with AMD GPU device access:
+
+```bash
+docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
+```
+
+## Manual review cases
+
+Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
artifacts/check/amd_developer_cloud_runbook.md ADDED
@@ -0,0 +1,50 @@
+# AMD Developer Cloud Runbook
+
+This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
+
+## 1. Create an AMD GPU VM
+
+Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
+
+## 2. Build the ROCm container
+
+```bash
+docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
+```
+
+## 3. Run a smoke check
+
+```bash
+docker run --rm -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --group-add video \
+  --ipc=host \
+  --network=host \
+  --security-opt seccomp=unconfined \
+  rocmport-cuda_first_repo
+```
+
+## 4. Run vLLM on ROCm
+
+```bash
+docker run --rm -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --group-add video \
+  --ipc=host \
+  --network=host \
+  --security-opt seccomp=unconfined \
+  -v "$PWD:/workspace/cuda_first_repo" \
+  vllm/vllm-openai-rocm:latest \
+  vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
+```
+
+## 5. Capture benchmark metadata
+
+```bash
+rocm-smi --showproductname --showmeminfo vram --showuse
+python scripts/collect_benchmark_result.py --output benchmark_result.json
+```
+
+Replace `data/benchmark_result.json` with the captured result before final submission.
artifacts/check/benchmark_result.json ADDED
@@ -0,0 +1,20 @@
+{
+  "verified": false,
+  "status": "pending_external_amd_developer_cloud_run",
+  "hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
+  "rocm_version": "capture with rocminfo or container metadata",
+  "vllm_version": "capture with python -m vllm --version",
+  "model": "Qwen/Qwen3-Coder-Next-FP8",
+  "prompt_config": {
+    "input_tokens": 512,
+    "output_tokens": 256,
+    "concurrency": 8,
+    "requests": 64
+  },
+  "throughput_tokens_per_second": null,
+  "p50_latency_ms": null,
+  "p95_latency_ms": null,
+  "peak_vram_gb": null,
+  "log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
+  "notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
+}
artifacts/check/migration_report.md ADDED
@@ -0,0 +1,45 @@
+# ROCmPort AI Migration Report: cuda_first_repo
+
+## AMD Readiness Score
+
+- Before deterministic fixes: 53/100
+- After deterministic fixes: 100/100
+
+| Category | Before | After |
+| --- | ---: | ---: |
+| Code portability | 0 | 100 |
+| Environment readiness | 8 | 100 |
+| Serving readiness | 90 | 100 |
+| Benchmark readiness | 65 | 100 |
+| Deployment readiness | 100 | 100 |
+
+## Findings
+
+| Severity | Category | Location | Finding | Suggested fix |
+| --- | --- | --- | --- | --- |
+| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
+| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
+
+## Generated Artifacts
+
+- `rocm_patch.diff` contains deterministic MVP fixes.
+- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
+- `amd_developer_cloud_runbook.md` documents the validation path.
+- `benchmark_result.json` records the AMD benchmark schema and status.
+
+## Qwen Agent Notes
+
+Qwen endpoint was not configured. The report uses deterministic scanner output only.
+
+## Remaining Risks
+
+- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
+- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
+- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
artifacts/check/rocm_patch.diff ADDED
@@ -0,0 +1,48 @@
+--- a/Dockerfile
++++ b/Dockerfile
+@@ -1,10 +1,10 @@
+-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
++FROM vllm/vllm-openai-rocm:latest
+ 
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ COPY . .
+ 
+-ENV NVIDIA_VISIBLE_DEVICES=all
++ENV HIP_VISIBLE_DEVICES=all
+ 
+ CMD ["python", "infer.py"]
+--- a/infer.py
++++ b/infer.py
+@@ -1,15 +1,18 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
++# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
++_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
++
+ 
+ MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+-device = torch.device("cuda")
++device = _rocmport_device
+ 
+ 
+ def main():
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
+-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
++    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
++    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
+     with torch.no_grad():
+         outputs = model.generate(**inputs, max_new_tokens=64)
+     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+--- a/scripts/serve_vllm.sh
++++ b/scripts/serve_vllm.sh
+@@ -2,5 +2,5 @@
+ set -euo pipefail
+ 
+ export CUDA_VISIBLE_DEVICES=0
+-nvidia-smi
++rocm-smi
+ vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
artifacts/check/rocmport_artifacts.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a416cb1bb90125df6f5b63bc41032aa6543b74b8b5b2a431761cde14b5a52d5b
+size 5229
artifacts/check2/Dockerfile.rocm ADDED
@@ -0,0 +1,11 @@
+FROM vllm/vllm-openai-rocm:latest
+
+WORKDIR /workspace/cuda_first_repo
+COPY . /workspace/cuda_first_repo
+
+RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
+
+ENV HIP_VISIBLE_DEVICES=0
+ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
+
+CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
artifacts/check2/ROCM_FEEDBACK.md ADDED
@@ -0,0 +1,17 @@
+# ROCm / AMD Developer Cloud Feedback
+
+## What worked well
+
+- The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
+- AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
+- Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
+
+## Friction points to document during the live run
+
+- Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
+- Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
+- More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
+
+## Suggested product improvement
+
+Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
artifacts/check2/ROCM_MIGRATION_COOKBOOK.md ADDED
@@ -0,0 +1,40 @@
+# ROCm Migration Cookbook
+
+## PyTorch device handling
+
+Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
+
+```python
+import torch
+
+# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+inputs = inputs.to(device)
+```
+
+## GPU inspection
+
+Replace NVIDIA-only commands with ROCm equivalents:
+
+```bash
+rocm-smi --showproductname --showmeminfo vram --showuse
+```
+
+## Containers
+
+For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
+
+```bash
+docker pull vllm/vllm-openai-rocm:latest
+```
+
+Run with AMD GPU device access:
+
+```bash
+docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
+```
+
+## Manual review cases
+
+Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
artifacts/check2/amd_developer_cloud_runbook.md ADDED
@@ -0,0 +1,50 @@
+# AMD Developer Cloud Runbook
+
+This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
+
+## 1. Create an AMD GPU VM
+
+Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
+
+## 2. Build the ROCm container
+
+```bash
+docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
+```
+
+## 3. Run a smoke check
+
+```bash
+docker run --rm -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --group-add video \
+  --ipc=host \
+  --network=host \
+  --security-opt seccomp=unconfined \
+  rocmport-cuda_first_repo
+```
+
+## 4. Run vLLM on ROCm
+
+```bash
+docker run --rm -it \
+  --device /dev/kfd \
+  --device /dev/dri \
+  --group-add video \
+  --ipc=host \
+  --network=host \
+  --security-opt seccomp=unconfined \
+  -v "$PWD:/workspace/cuda_first_repo" \
+  vllm/vllm-openai-rocm:latest \
+  vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
+```
+
+## 5. Capture benchmark metadata
+
+```bash
+rocm-smi --showproductname --showmeminfo vram --showuse
+python scripts/collect_benchmark_result.py --output benchmark_result.json
+```
+
+Replace `data/benchmark_result.json` with the captured result before final submission.
artifacts/check2/benchmark_result.json ADDED
@@ -0,0 +1,20 @@
+{
+  "verified": false,
+  "status": "pending_external_amd_developer_cloud_run",
+  "hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
+  "rocm_version": "capture with rocminfo or container metadata",
+  "vllm_version": "capture with python -m vllm --version",
+  "model": "Qwen/Qwen3-Coder-Next-FP8",
+  "prompt_config": {
+    "input_tokens": 512,
+    "output_tokens": 256,
+    "concurrency": 8,
+    "requests": 64
+  },
+  "throughput_tokens_per_second": null,
+  "p50_latency_ms": null,
+  "p95_latency_ms": null,
+  "peak_vram_gb": null,
+  "log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
+  "notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
+}
artifacts/check2/migration_report.md ADDED
@@ -0,0 +1,46 @@
+# ROCmPort AI Migration Report: cuda_first_repo
+
+## AMD Readiness Score
+
+- Before deterministic fixes: 51/100
+- After deterministic fixes: 100/100
+
+| Category | Before | After |
+| --- | ---: | ---: |
+| Code portability | 0 | 100 |
+| Environment readiness | 0 | 100 |
+| Serving readiness | 90 | 100 |
+| Benchmark readiness | 65 | 100 |
+| Deployment readiness | 100 | 100 |
+
+## Findings
+
+| Severity | Category | Location | Finding | Suggested fix |
+| --- | --- | --- | --- | --- |
+| high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+| high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
+| medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+| high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+| high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+| high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+| low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+| medium | Environment readiness | `scripts/serve_vllm.sh:4` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+| high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+| low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
+
+## Generated Artifacts
+
+- `rocm_patch.diff` contains deterministic MVP fixes.
+- `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
+- `amd_developer_cloud_runbook.md` documents the validation path.
+- `benchmark_result.json` records the AMD benchmark schema and status.
+
+## Qwen Agent Notes
+
+Qwen endpoint was not configured. The report uses deterministic scanner output only.
+
+## Remaining Risks
+
+- CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
+- Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
+- ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
artifacts/check2/rocm_patch.diff ADDED
@@ -0,0 +1,50 @@
+--- a/Dockerfile
++++ b/Dockerfile
+@@ -1,10 +1,10 @@
+-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
++FROM vllm/vllm-openai-rocm:latest
+ 
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ COPY . .
+ 
+-ENV NVIDIA_VISIBLE_DEVICES=all
++ENV HIP_VISIBLE_DEVICES=all
+ 
+ CMD ["python", "infer.py"]
+--- a/infer.py
++++ b/infer.py
+@@ -1,15 +1,18 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
++# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
++_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
++
+ 
+ MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+-device = torch.device("cuda")
++device = _rocmport_device
+ 
+ 
+ def main():
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
+-    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
++    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
++    inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
+     with torch.no_grad():
+         outputs = model.generate(**inputs, max_new_tokens=64)
+     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+--- a/scripts/serve_vllm.sh
++++ b/scripts/serve_vllm.sh
+@@ -1,6 +1,6 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ 
+-export CUDA_VISIBLE_DEVICES=0
+-nvidia-smi
++export HIP_VISIBLE_DEVICES=0
++rocm-smi
+ vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
artifacts/check2/rocmport_artifacts.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b346f1ca8119e31fb879012c85fb6a20663459f9266944b277e346d3d8c89fa
+ size 5275
artifacts/hackathon_content.txt ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/Dockerfile.rocm ADDED
@@ -0,0 +1,11 @@
+ FROM vllm/vllm-openai-rocm:latest
+
+ WORKDIR /workspace/cuda_first_repo
+ COPY . /workspace/cuda_first_repo
+
+ RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
+
+ ENV HIP_VISIBLE_DEVICES=0
+ ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
+
+ CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_FEEDBACK.md ADDED
@@ -0,0 +1,17 @@
+ # ROCm / AMD Developer Cloud Feedback
+
+ ## What worked well
+
+ - The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
+ - AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
+ - Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
+
+ ## Friction points to document during the live run
+
+ - Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
+ - Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
+ - More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
+
+ ## Suggested product improvement
+
+ Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/ROCM_MIGRATION_COOKBOOK.md ADDED
@@ -0,0 +1,46 @@
+ # ROCm Migration Cookbook
+
+ ## PyTorch device handling
+
+ Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
+
+ ```python
+ import torch
+
+ # ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+ inputs = inputs.to(device)
+ ```
+
+ ## GPU inspection
+
+ Replace NVIDIA-only commands with ROCm equivalents:
+
+ ```bash
+ rocm-smi --showproductname --showmeminfo vram --showuse
+ ```
+
+ ## Containers
+
+ For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
+
+ ```bash
+ docker pull vllm/vllm-openai-rocm:latest
+ ```
+
+ Run with AMD GPU device access:
+
+ ```bash
+ docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
+ ```
+
+ ## Manual review cases
+
+ Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
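+
+ As a first-pass aid for those cases, `hipify-perl` from the ROCm HIPIFY tools can translate CUDA sources to HIP (a sketch; the file path is a hypothetical example, and the output still needs manual review):
+
+ ```bash
+ hipify-perl kernels/vector_add.cu > kernels/vector_add.hip.cpp
+ ```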
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/amd_developer_cloud_runbook.md ADDED
@@ -0,0 +1,58 @@
+ # AMD Developer Cloud Runbook
+
+ This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
+
+ ## 1. Create an AMD GPU VM
+
+ Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
+
+ ## 2. Build the ROCm container
+
+ ```bash
+ docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
+ ```
+
+ ## 3. Run a smoke check
+
+ ```bash
+ docker run --rm -it \
+ --device /dev/kfd \
+ --device /dev/dri \
+ --group-add video \
+ --ipc=host \
+ --network=host \
+ --security-opt seccomp=unconfined \
+ rocmport-cuda_first_repo
+ ```
+
+ ## 4. Run vLLM on ROCm
+
+ ```bash
+ docker run --rm -it \
+ --device /dev/kfd \
+ --device /dev/dri \
+ --group-add video \
+ --ipc=host \
+ --network=host \
+ --security-opt seccomp=unconfined \
+ -v "$PWD:/workspace/cuda_first_repo" \
+ vllm/vllm-openai-rocm:latest \
+ vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
+ ```
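+
+ Once the server is up, a request against the OpenAI-compatible endpoint is a quick sanity check (a sketch; vLLM serves on port 8000 by default, and the model name assumes the command above):
+
+ ```bash
+ curl -s http://localhost:8000/v1/completions \
+   -H "Content-Type: application/json" \
+   -d '{"model": "Qwen/Qwen3-Coder-Next-FP8", "prompt": "Explain ROCm in one sentence.", "max_tokens": 32}'
+ ```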
+
+ ## 5. Capture benchmark metadata
+
+ ```bash
+ rocm-smi --showproductname --showmeminfo vram --showuse
+ python scripts/collect_benchmark_result.py --output benchmark_result.json
+ ```
+
+ Replace `data/benchmark_result.json` with the captured result before final submission.
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/benchmark_result.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "verified": false,
+ "status": "pending_external_amd_developer_cloud_run",
+ "hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
+ "rocm_version": "capture with rocminfo or container metadata",
+ "vllm_version": "capture with python -m vllm --version",
+ "model": "Qwen/Qwen3-Coder-Next-FP8",
+ "prompt_config": {
+ "input_tokens": 512,
+ "output_tokens": 256,
+ "concurrency": 8,
+ "requests": 64
+ },
+ "throughput_tokens_per_second": null,
+ "p50_latency_ms": null,
+ "p95_latency_ms": null,
+ "peak_vram_gb": null,
+ "log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
+ "notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
+ }
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/migration_report.md ADDED
@@ -0,0 +1,80 @@
+ # ROCmPort AI Migration Report: cuda_first_repo
+
+ ## AMD Readiness Score
+
+ - Before deterministic fixes: 42/100
+ - Migration package generated: 67/100
+ - This score means ROCm migration artifacts were generated and are ready for AMD Developer Cloud validation; it is not a production certification.
+
+ | Category | Before | Migration package |
+ | --- | ---: | ---: |
+ | Code portability | 0 | 46 |
+ | Environment readiness | 0 | 0 |
+ | Serving readiness | 80 | 96 |
+ | Benchmark readiness | 30 | 92 |
+ | Deployment readiness | 100 | 100 |
+
+ ## Findings
+
+ | Severity | Category | Location | Finding | Suggested fix |
+ | --- | --- | --- | --- | --- |
+ | medium | Environment readiness | `benchmarks/benchmark.py:13` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | high | Benchmark readiness | `benchmarks/benchmark.py:22` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+ | high | Benchmark readiness | `benchmarks/benchmark.py:24` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+ | high | Code portability | `benchmarks/benchmark.py:36` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+ | high | Code portability | `benchmarks/benchmark.py:38` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+ | high | Code portability | `benchmarks/benchmark.py:41` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+ | medium | Environment readiness | `docker-compose.yml:6` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | medium | Environment readiness | `docker-compose.yml:7` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | medium | Environment readiness | `docker-compose.yml:8` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | medium | Environment readiness | `docker-compose.yml:24` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | medium | Environment readiness | `docker-compose.yml:25` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | high | Environment readiness | `docker-compose.yml:29` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+ | low | Serving readiness | `docker-compose.yml:30` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
+ | high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
+ | medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+ | high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+ | high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+ | low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+ | medium | Environment readiness | `requirements.txt:4` | Dependency references a CUDA-specific package. | Replace CUDA-specific wheels with ROCm-compatible PyTorch or library builds. |
+ | medium | Environment readiness | `requirements.txt:5` | Dependency references a CUDA-specific package. | Replace CUDA-specific wheels with ROCm-compatible PyTorch or library builds. |
+ | medium | Environment readiness | `scripts/serve_vllm.sh:4` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+ | low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
+ | medium | Environment readiness | `scripts/train.py:13` | CUDA_VISIBLE_DEVICES is used for GPU selection. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | medium | Environment readiness | `scripts/train.py:14` | CUDA toolkit path environment variable found. | Remove CUDA toolkit path assumptions or replace with ROCm installation paths when required. |
+ | high | Code portability | `scripts/train.py:18` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+ | low | Code portability | `scripts/train.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+ | high | Code portability | `scripts/train.py:30` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+ | high | Code portability | `scripts/train.py:35` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+ | high | Code portability | `scripts/train.py:36` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+ | high | Code portability | `scripts/train.py:44` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+ | high | Code portability | `scripts/train.py:45` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+ | low | Code portability | `scripts/train.py:59` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+
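+ As a minimal sketch of the environment-variable fixes suggested above (the device index is illustrative), GPU selection can be set portably at process start:
+
+ ```python
+ import os
+
+ # HIP_VISIBLE_DEVICES is the ROCm counterpart of CUDA_VISIBLE_DEVICES.
+ os.environ.setdefault("HIP_VISIBLE_DEVICES", "0")
+ ```
+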
+ ## Generated Artifacts
+
+ - `rocm_patch.diff` contains deterministic MVP fixes.
+ - `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
+ - `amd_developer_cloud_runbook.md` documents the validation path.
+ - `benchmark_result.json` records the AMD benchmark schema and status.
+
+ ## Qwen Agent Notes
+
+ The Qwen endpoint was not configured, so the report uses deterministic scanner output only.
+
+ ## Remaining Risks
+
+ - CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
+ - Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
+ - ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocm_patch.diff ADDED
@@ -0,0 +1,152 @@
+ --- a/benchmarks/benchmark.py
+ +++ b/benchmarks/benchmark.py
+ @@ -9,6 +9,9 @@
+ import json
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ +
+ +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+ +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" # should → HIP_VISIBLE_DEVICES
+
+ @@ -33,12 +36,12 @@
+ hw = gpu_info()
+ print("GPU info:", hw)
+
+ - device = torch.device("cuda") # hardcoded CUDA device
+ + device = _rocmport_device # was hardcoded to CUDA
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda()
+ + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was .cuda()
+ model.eval()
+
+ - inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda") # .to("cuda")
+ + inputs = tokenizer(PROMPT, return_tensors="pt").to(_rocmport_device) # was .to("cuda")
+
+ # Warm-up
+ with torch.no_grad():
+ --- a/docker-compose.yml
+ +++ b/docker-compose.yml
+ @@ -3,9 +3,7 @@
+ inference:
+ image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ environment:
+ - - NVIDIA_VISIBLE_DEVICES=all
+ - - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ - - CUDA_VISIBLE_DEVICES=0
+ + - HIP_VISIBLE_DEVICES=0
+ deploy:
+ resources:
+ reservations:
+ @@ -21,10 +21,9 @@
+ vllm_server:
+ image: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ environment:
+ - - NVIDIA_VISIBLE_DEVICES=0,1
+ - - CUDA_VISIBLE_DEVICES=0,1
+ + - HIP_VISIBLE_DEVICES=0,1
+ ports:
+ - "8000:8000"
+ command: >
+ - bash -c "nvidia-smi && pip install vllm &&
+ + bash -c "rocm-smi && pip install vllm &&
+ vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 2"
+ --- a/Dockerfile
+ +++ b/Dockerfile
+ @@ -1,10 +1,10 @@
+ -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ +FROM vllm/vllm-openai-rocm:latest
+
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ COPY . .
+
+ -ENV NVIDIA_VISIBLE_DEVICES=all
+ +ENV HIP_VISIBLE_DEVICES=all
+
+ CMD ["python", "infer.py"]
+ --- a/infer.py
+ +++ b/infer.py
+ @@ -1,15 +1,18 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+ +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ +
+
+ MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+ -device = torch.device("cuda")
+ +device = _rocmport_device
+
+
+ def main():
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
+ - inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+ + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+ + inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
+ with torch.no_grad():
+ outputs = model.generate(**inputs, max_new_tokens=64)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ --- a/scripts/serve_vllm.sh
+ +++ b/scripts/serve_vllm.sh
+ @@ -1,6 +1,6 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ -export CUDA_VISIBLE_DEVICES=0
+ -nvidia-smi
+ +export HIP_VISIBLE_DEVICES=0
+ +rocm-smi
+ vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
+ --- a/scripts/train.py
+ +++ b/scripts/train.py
+ @@ -9,13 +9,16 @@
+ from torch.utils.data import DataLoader, TensorDataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+ +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ +
+ # ── CUDA-specific patterns that ROCmPort will flag ─────────────────────────
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0" # should → HIP_VISIBLE_DEVICES
+ os.environ["CUDA_HOME"] = "/usr/local/cuda" # should be removed / replaced
+
+ MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+
+ -device = torch.device("cuda") # hardcoded CUDA device
+ +device = _rocmport_device # was hardcoded to CUDA
+ print("CUDA available:", torch.cuda.is_available())
+
+
+ @@ -27,13 +30,13 @@
+
+ def train(epochs: int = 3, lr: float = 2e-5):
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda() # .cuda() call
+ + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device) # was a .cuda() call
+
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+
+ ids, labels = get_dummy_batch()
+ - ids = ids.to("cuda") # hardcoded "cuda" string
+ - labels = labels.to("cuda") # hardcoded "cuda" string
+ + ids = ids.to(_rocmport_device) # was a hardcoded "cuda" string
+ + labels = labels.to(_rocmport_device) # was a hardcoded "cuda" string
+
+ dataset = TensorDataset(ids, labels)
+ loader = DataLoader(dataset, batch_size=2)
+ @@ -41,8 +44,8 @@
+ model.train()
+ for epoch in range(epochs):
+ for batch_ids, batch_labels in loader:
+ - batch_ids = batch_ids.cuda() # another .cuda() call
+ - batch_labels = batch_labels.cuda()
+ + batch_ids = batch_ids.to(_rocmport_device) # was another .cuda() call
+ + batch_labels = batch_labels.to(_rocmport_device)
+ outputs = model(input_ids=batch_ids, labels=batch_labels)
+ loss = outputs.loss
+ loss.backward()
artifacts/runtime/rocmport-ui-artifacts-2394ce307a044fcfa1edda12f4f304cb/rocmport_artifacts.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:232d4aa956fe47770fd6f443c73af4ba5f3bfbc5cc73c997f9fc77538bf7c918
+ size 6603
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/Dockerfile.rocm ADDED
@@ -0,0 +1,11 @@
+ FROM vllm/vllm-openai-rocm:latest
+
+ WORKDIR /workspace/cuda_first_repo
+ COPY . /workspace/cuda_first_repo
+
+ RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
+
+ ENV HIP_VISIBLE_DEVICES=0
+ ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
+
+ CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_FEEDBACK.md ADDED
@@ -0,0 +1,17 @@
+ # ROCm / AMD Developer Cloud Feedback
+
+ ## What worked well
+
+ - The ROCm-enabled vLLM container gives developers a clear serving path for AMD Instinct GPUs.
+ - AMD Developer Cloud is well aligned with hackathon demos because developers can avoid local GPU setup.
+ - Qwen3-Coder-Next on AMD Instinct is a strong story for repo-level coding agents.
+
+ ## Friction points to document during the live run
+
+ - Exact VM image, ROCm version, and Docker image should be easy to capture in benchmark logs.
+ - Users need obvious examples for replacing NVIDIA container flags and monitoring commands.
+ - More migration examples for common CUDA-first PyTorch repos would reduce onboarding time.
+
+ ## Suggested product improvement
+
+ Publish a small official CUDA-to-ROCm migration checklist for PyTorch, vLLM, and Hugging Face inference projects, with copyable Docker commands for AMD Developer Cloud.
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/ROCM_MIGRATION_COOKBOOK.md ADDED
@@ -0,0 +1,50 @@
+ # ROCm Migration Cookbook
+
+ ## PyTorch device handling
+
+ Use a runtime device abstraction instead of hardcoding `.cuda()` or `torch.device("cuda")` everywhere.
+
+ ```python
+ import torch
+
+ # ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+ inputs = inputs.to(device)
+ ```
+
+ ## GPU inspection
+
+ Replace NVIDIA-only commands with ROCm equivalents:
+
+ ```bash
+ rocm-smi --showproductname --showmeminfo vram --showuse
+ ```
+
+ ## Containers
+
+ For vLLM serving on AMD GPUs, use the ROCm-enabled vLLM image:
+
+ ```bash
+ docker pull vllm/vllm-openai-rocm:latest
+ ```
+
+ Run with AMD GPU device access:
+
+ ```bash
+ docker run --rm -it --device /dev/kfd --device /dev/dri --group-add video --ipc=host --network=host --security-opt seccomp=unconfined vllm/vllm-openai-rocm:latest
+ ```
+
+ ## Manual review cases
+
+ Manual migration is still required for CUDA C++ kernels, CUDA-only binary wheels, custom Triton kernels, and libraries that ship only CUDA builds.
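+
+ A quick way to confirm which backend a given PyTorch build targets (a small sketch; on ROCm wheels `torch.version.hip` is a version string, while on CUDA wheels it is None):
+
+ ```python
+ import torch
+
+ # A GPU-enabled build reports exactly one of these as non-None.
+ print("hip build:", torch.version.hip)
+ print("cuda build:", torch.version.cuda)
+ ```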
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/amd_developer_cloud_runbook.md ADDED
@@ -0,0 +1,57 @@
+ # AMD Developer Cloud Runbook
+
+ This runbook validates `cuda_first_repo` on AMD Developer Cloud without executing untrusted code inside the ROCmPort AI Space.
+
+ ## 1. Create an AMD GPU VM
+
+ Use an AMD Developer Cloud VM with an AMD Instinct GPU and ROCm-ready Docker support.
+
+ ## 2. Build the ROCm container
+
+ ```bash
+ docker build -f Dockerfile.rocm -t rocmport-cuda_first_repo .
+ ```
+
+ ## 3. Run a smoke check
+
+ ```bash
+ docker run --rm -it \
+ --device /dev/kfd \
+ --device /dev/dri \
+ --group-add video \
+ --ipc=host \
+ --network=host \
+ --security-opt seccomp=unconfined \
+ rocmport-cuda_first_repo
+ ```
+
+ ## 4. Run vLLM on ROCm
+
+ ```bash
+ docker run --rm -it \
+ --device /dev/kfd \
+ --device /dev/dri \
+ --group-add video \
+ --ipc=host \
+ --network=host \
+ --security-opt seccomp=unconfined \
+ -v "$PWD:/workspace/cuda_first_repo" \
+ vllm/vllm-openai-rocm:latest \
+ vllm serve Qwen/Qwen3-Coder-Next-FP8 --tensor-parallel-size 1
+ ```
+
+ ## 5. Capture benchmark metadata
+
+ ```bash
+ rocm-smi --showproductname --showmeminfo vram --showuse
+ python scripts/collect_benchmark_result.py --output benchmark_result.json
+ ```
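+
+ To make the captured record reproducible, it may also help to log the exact stack versions next to the metrics (the version commands follow the references in `benchmark_result.json`):
+
+ ```bash
+ python -m vllm --version
+ rocminfo | head -n 20
+ ```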
+
+ Replace `data/benchmark_result.json` with the captured result before final submission.
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/benchmark_result.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "verified": false,
+ "status": "pending_external_amd_developer_cloud_run",
+ "hardware": "AMD Instinct MI300X or compatible AMD Developer Cloud instance",
+ "rocm_version": "capture with rocminfo or container metadata",
+ "vllm_version": "capture with python -m vllm --version",
+ "model": "Qwen/Qwen3-Coder-Next-FP8",
+ "prompt_config": {
+ "input_tokens": 512,
+ "output_tokens": 256,
+ "concurrency": 8,
+ "requests": 64
+ },
+ "throughput_tokens_per_second": null,
+ "p50_latency_ms": null,
+ "p95_latency_ms": null,
+ "peak_vram_gb": null,
+ "log_excerpt": "Replace this record after running the generated AMD Developer Cloud runbook.",
+ "notes": "This file is a transparent benchmark schema, not a fabricated result. The Space displays it as pending until measured values are captured on AMD Developer Cloud."
+ }
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/migration_report.md ADDED
@@ -0,0 +1,45 @@
+ # ROCmPort AI Migration Report: cuda_first_repo
+
+ ## AMD Readiness Score
+
+ - Before deterministic fixes: 53/100
+ - After deterministic fixes: 100/100
+
+ | Category | Before | After |
+ | --- | ---: | ---: |
+ | Code portability | 0 | 100 |
+ | Environment readiness | 8 | 100 |
+ | Serving readiness | 90 | 100 |
+ | Benchmark readiness | 65 | 100 |
+ | Deployment readiness | 100 | 100 |
+
+ ## Findings
+
+ | Severity | Category | Location | Finding | Suggested fix |
+ | --- | --- | --- | --- | --- |
+ | high | Benchmark readiness | `benchmarks/benchmark.py:6` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+ | high | Environment readiness | `Dockerfile:1` | Dockerfile uses an NVIDIA CUDA base image. | Use vllm/vllm-openai-rocm:latest for vLLM serving or rocm/pytorch:latest for PyTorch workloads. |
+ | medium | Environment readiness | `Dockerfile:8` | NVIDIA container environment variable found. | Use HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES for AMD GPU targeting. |
+ | high | Code portability | `infer.py:6` | torch.device is hardcoded to CUDA. | Use torch.device("cuda" if torch.cuda.is_available() else "cpu"); ROCm PyTorch reports AMD GPUs through torch.cuda. |
+ | high | Code portability | `infer.py:11` | PyTorch tensor or module is moved with a hardcoded .cuda() call. | Replace .cuda() with .to(_rocmport_device) and define a runtime device abstraction. |
+ | high | Code portability | `infer.py:12` | Tensor or module transfer hardcodes the CUDA device string. | Replace .to("cuda") with .to(_rocmport_device). |
+ | low | Code portability | `infer.py:19` | CUDA availability check may confuse ROCm users because PyTorch ROCm still uses the torch.cuda namespace. | Keep the API call but document that it covers AMD GPUs under ROCm PyTorch. |
+ | high | Environment readiness | `scripts/serve_vllm.sh:5` | NVIDIA-specific GPU inspection command found. | Use rocm-smi for AMD GPU monitoring and benchmark metadata collection. |
+ | low | Serving readiness | `scripts/serve_vllm.sh:6` | vLLM serving command found without explicit ROCm container guidance. | Run vLLM inside vllm/vllm-openai-rocm with /dev/kfd, /dev/dri, host IPC, and video group access. |
+
+ ## Generated Artifacts
+
+ - `rocm_patch.diff` contains deterministic MVP fixes.
+ - `Dockerfile.rocm` uses the ROCm-enabled vLLM container.
+ - `amd_developer_cloud_runbook.md` documents the validation path.
+ - `benchmark_result.json` records the AMD benchmark schema and status.
+
+ ## Qwen Agent Notes
+
+ The Qwen endpoint was not configured, so the report uses deterministic scanner output only.
+
+ ## Remaining Risks
+
+ - CUDA C++ kernels, custom Triton kernels, and CUDA-only binary dependencies require manual review.
+ - Uploaded repositories are not executed inside the Space; live validation belongs on AMD Developer Cloud.
+ - ROCm performance depends on model, batch shape, vLLM version, ROCm version, and GPU instance configuration.
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocm_patch.diff ADDED
@@ -0,0 +1,48 @@
+ --- a/Dockerfile
+ +++ b/Dockerfile
+ @@ -1,10 +1,10 @@
+ -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ +FROM vllm/vllm-openai-rocm:latest
+
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ COPY . .
+
+ -ENV NVIDIA_VISIBLE_DEVICES=all
+ +ENV HIP_VISIBLE_DEVICES=all
+
+ CMD ["python", "infer.py"]
+ --- a/infer.py
+ +++ b/infer.py
+ @@ -1,15 +1,18 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ +# ROCm PyTorch exposes AMD GPUs through the torch.cuda namespace.
+ +_rocmport_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ +
+
+ MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+ -device = torch.device("cuda")
+ +device = _rocmport_device
+
+
+ def main():
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ - model = AutoModelForCausalLM.from_pretrained(MODEL_ID).cuda()
+ - inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to("cuda")
+ + model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(_rocmport_device)
+ + inputs = tokenizer("Explain ROCm in one sentence.", return_tensors="pt").to(_rocmport_device)
+ with torch.no_grad():
+ outputs = model.generate(**inputs, max_new_tokens=64)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ --- a/scripts/serve_vllm.sh
+ +++ b/scripts/serve_vllm.sh
+ @@ -2,5 +2,5 @@
+ set -euo pipefail
+
+ export CUDA_VISIBLE_DEVICES=0
+ -nvidia-smi
+ +rocm-smi
+ vllm serve Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1
artifacts/runtime/rocmport-ui-artifacts-3b0405c215eb4a3cb5cf402e4ee6453d/rocmport_artifacts.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd2494bdf724b6ec9d98675fea27408938c2ee0f7ee6b2bd219e10a857a1c105
+ size 5229
artifacts/runtime/rocmport-ui-artifacts-6f4540ff23e142ad9d6ab18154ea44e6/Dockerfile.rocm ADDED
@@ -0,0 +1,11 @@
+ FROM vllm/vllm-openai-rocm:latest
+
+ WORKDIR /workspace/cuda_first_repo
+ COPY . /workspace/cuda_first_repo
+
+ RUN if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi
+
+ ENV HIP_VISIBLE_DEVICES=0
+ ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
+
+ CMD ["python", "-c", "import torch; print('torch', torch.__version__); print('rocm_gpu_available', torch.cuda.is_available())"]