""" ROCmPort AI - Gradio Space entry point Calls the deployed FastAPI backend (Render) and streams agent events. """ import gradio as gr import httpx import json BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com" AGENT_LABELS = { "analyzer": "Analyzer", "translator": "Translator", "optimizer": "Optimizer", "tester": "Tester", "coordinator": "Coordinator", } STATUS_LABELS = { "waiting": "[waiting]", "running": "[running]", "done": "[done]", "failed": "[FAILED]", "retrying": "[retrying]", } EXAMPLE_REDUCTION = """\ __global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) { extern __shared__ float sdata[]; unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; float mySum = (i < n) ? g_idata[i] : 0; if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x]; sdata[tid] = mySum; __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) { if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s]; __syncthreads(); } // DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64) if (tid < 32) { volatile float* vsmem = sdata; vsmem[tid] = mySum = mySum + vsmem[tid + 32]; vsmem[tid] = mySum = mySum + vsmem[tid + 16]; vsmem[tid] = mySum = mySum + vsmem[tid + 8]; vsmem[tid] = mySum = mySum + vsmem[tid + 4]; vsmem[tid] = mySum = mySum + vsmem[tid + 2]; vsmem[tid] = mySum = mySum + vsmem[tid + 1]; } if (tid == 0) g_odata[blockIdx.x] = sdata[0]; }""" EXAMPLE_VECTOR_ADD = """\ __global__ void vectorAdd(const float *A, const float *B, float *C, int n) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < n) { C[i] = A[i] + B[i]; // Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64) if (threadIdx.x % 32 == 0) { printf("Warp leader: %d\n", threadIdx.x / 32); } } }""" EXAMPLE_MATMUL = """\ __global__ void matmul(float *A, float *B, float *C, int N) { __shared__ float As[32][32]; __shared__ float Bs[32][32]; int row = blockIdx.y * 32 + threadIdx.y; int col = blockIdx.x * 32 + threadIdx.x; float sum = 0.0f; for (int k = 0; k < N / 32; k++) { As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x]; Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col]; __syncthreads(); for (int n = 0; n < 32; n++) sum += As[threadIdx.y][n] * Bs[n][threadIdx.x]; __syncthreads(); } C[row * N + col] = sum; }""" def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool): """Generator: streams agent events and yields (log_markdown, hip_code).""" if not cuda_code or len(cuda_code.strip()) < 10: yield "Please provide CUDA kernel code (at least 10 characters).", "" return kernel_name = kernel_name.strip() or "custom" log_lines: list[str] = [] hip_code = "" payload = { "cuda_code": cuda_code, "kernel_name": kernel_name, "simple_mode": bool(simple_mode), } log_lines.append("**Connecting to ROCmPort AI backend...**") yield "\n\n".join(log_lines), hip_code try: with httpx.Client(timeout=180.0) as client: with client.stream("POST", f"{BACKEND_URL}/port", json=payload) as resp: resp.raise_for_status() for line in resp.iter_lines(): if not line: continue if not line.startswith("data: "): continue data = line[6:] if data.strip() == "[DONE]": break try: event = json.loads(data) except json.JSONDecodeError: continue agent = event.get("agent", "system") status = event.get("status", "running") message = event.get("message", "") detail = event.get("detail") or "" label = AGENT_LABELS.get(agent, agent.capitalize()) s_label = STATUS_LABELS.get(status, status) log_lines.append(f"**{label}** {s_label} -- {message}") if status == "done" and detail: try: detail_json = json.loads(detail) candidate = ( detail_json.get("hip_code") or detail_json.get("optimized_code") or detail_json.get("translated_code") or "" ) if candidate: hip_code = candidate except (json.JSONDecodeError, AttributeError): pass yield "\n\n".join(log_lines), hip_code except httpx.ConnectError: log_lines.append( "**Could not connect to backend.**\n\n" "> The server may be cold-starting -- please wait ~30 s and retry." ) yield "\n\n".join(log_lines), hip_code return except httpx.TimeoutException: log_lines.append("**Request timed out.** The pipeline may still be running -- try again shortly.") yield "\n\n".join(log_lines), hip_code return except httpx.HTTPStatusError as exc: log_lines.append(f"**HTTP {exc.response.status_code}**: {exc.response.text[:300]}") yield "\n\n".join(log_lines), hip_code return except Exception as exc: # noqa: BLE001 log_lines.append(f"**Unexpected error**: {exc}") yield "\n\n".join(log_lines), hip_code return if not hip_code: log_lines.append("\nPipeline finished but no HIP code was extracted. Check agent logs above.") else: log_lines.append("\n**Migration complete.** HIP code is shown on the right.") yield "\n\n".join(log_lines), hip_code CSS = ( ".panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; } " "footer { display: none !important; }" ) with gr.Blocks(title="ROCmPort AI -- CUDA to ROCm Migration") as demo: gr.Markdown( """# ROCmPort AI ### Agentic CUDA to ROCm/HIP migration with wavefront-64 bug detection > **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) | > **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI) `hipify-clang` translates CUDA API calls mechanically -- it **cannot** detect that `if (tid < 32)` in a warp-level reduction silently skips lanes 32-63 on AMD wavefront-64. The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.** """ ) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Input", elem_classes="panel-header") cuda_input = gr.Code( label="CUDA Kernel Code", language="cpp", lines=22, value=EXAMPLE_REDUCTION, ) with gr.Row(): kernel_name = gr.Textbox( label="Kernel Name", value="reduction", placeholder="e.g. reduction, matmul, vector_add", scale=2, ) simple_mode = gr.Checkbox( label="Explain Like I am 5", value=False, scale=1, ) with gr.Row(): port_btn = gr.Button("Port to ROCm", variant="primary", scale=3) clear_btn = gr.Button("Clear", scale=1) gr.Examples( examples=[ [EXAMPLE_REDUCTION, "reduction", False], [EXAMPLE_VECTOR_ADD, "vector_add", False], [EXAMPLE_MATMUL, "matmul", False], ], inputs=[cuda_input, kernel_name, simple_mode], label="Demo Kernels (pre-loaded with intentional AMD bugs)", ) with gr.Column(scale=1): gr.Markdown("### Output", elem_classes="panel-header") log_output = gr.Markdown( value="*Agent steps will appear here once you click **Port to ROCm**.*", label="Agent Pipeline Log", ) hip_output = gr.Code( label="Translated and Optimized HIP Code", language="cpp", lines=18, ) gr.Markdown( """ --- ### How the pipeline works | Agent | Role | |-------|------| | **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout | | **Translator** | Runs `hipify` then applies LLM-guided fixes for bugs `hipify` cannot detect | | **Tester** | Verifies compilation with `hipcc` and checks output correctness | | **Optimizer** | Proposes MI300X-specific optimisations; re-tested against baseline | | **Coordinator** | Orchestrates the loop; retries up to 3x if the optimised output regresses | ### The key bug: warp-size assumption ```c // NVIDIA (warpSize = 32) -- silently WRONG on AMD if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... } // AMD-correct (wavefront = 64) if (tid < 64) { vsmem[tid] += vsmem[tid + 32]; if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... } } ``` ### Benchmark highlights (MI300X, ROCm 7.0) | Kernel | Result | |--------|--------| | matrix_multiply 512x512 | 2.91x speedup over baseline HIP | | vector_add 32M elements | ~3918 GB/s (~74% of MI300X peak) | | reduction 16M elements | correctness PASS after wavefront-64 fix | > Source: `docs/benchmark_runs/` -- real `rocprof` CSV output, May 2026. > Results vary with kernel complexity; these figures are not guaranteed on every input. """ ) port_btn.click( fn=port_kernel, inputs=[cuda_input, kernel_name, simple_mode], outputs=[log_output, hip_output], ) clear_btn.click( fn=lambda: ("*Agent steps will appear here once you click **Port to ROCm**.*", ""), outputs=[log_output, hip_output], ) if __name__ == "__main__": demo.launch(theme=gr.themes.Default(primary_hue="orange"), css=CSS)