"""
ROCmPort AI - Gradio Space entry point
Calls the deployed FastAPI backend (Render) and streams agent events.
"""

import gradio as gr
import httpx
import json

# Base URL of the deployed backend; port_kernel POSTs to f"{BACKEND_URL}/port".
BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com"

# Display name for each agent id found in streamed events (the id, capitalised).
AGENT_LABELS = {
    agent_id: agent_id.capitalize()
    for agent_id in ("analyzer", "translator", "optimizer", "tester", "coordinator")
}

# Bracketed tag shown after the agent name in the log, keyed by event status.
STATUS_LABELS = {
    status: f"[{status}]"
    for status in ("waiting", "running", "done", "failed", "retrying")
}
# Failures are the one tag rendered in upper case; re-assigning an existing
# key keeps its original position in the dict.
STATUS_LABELS["failed"] = "[FAILED]"

# Default kernel shown in the CUDA editor and the first row of the demo
# examples. The final `if (tid < 32)` stage is the deliberate 32-lane
# assumption called out in the kernel's own comment; the text is sent to the
# backend verbatim, so it must stay exactly as written.
EXAMPLE_REDUCTION = """\
__global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) {
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    float mySum = (i < n) ? g_idata[i] : 0;
    if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;
    __syncthreads();
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s];
        __syncthreads();
    }
    // DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64)
    if (tid < 32) {
        volatile float* vsmem = sdata;
        vsmem[tid] = mySum = mySum + vsmem[tid + 32];
        vsmem[tid] = mySum = mySum + vsmem[tid + 16];
        vsmem[tid] = mySum = mySum + vsmem[tid + 8];
        vsmem[tid] = mySum = mySum + vsmem[tid + 4];
        vsmem[tid] = mySum = mySum + vsmem[tid + 2];
        vsmem[tid] = mySum = mySum + vsmem[tid + 1];
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}"""

# Demo kernel offered in the examples list: element-wise vector add with a
# device-side printf that assumes 32-thread warps.
# The C newline escape in the printf format is written as "\\n" so the kernel
# text carries a literal backslash-n. A bare "\n" would be interpreted by
# Python and split the C string literal across two lines, producing invalid
# CUDA source.
EXAMPLE_VECTOR_ADD = """\
__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
        // Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64)
        if (threadIdx.x % 32 == 0) {
            printf("Warp leader: %d\\n", threadIdx.x / 32);
        }
    }
}"""

# Demo kernel offered in the examples list: matrix multiply staged through
# 32x32 shared-memory tiles. It walks N / 32 tiles and writes C[row * N + col]
# without any bounds check on row or col.
EXAMPLE_MATMUL = """\
__global__ void matmul(float *A, float *B, float *C, int N) {
    __shared__ float As[32][32];
    __shared__ float Bs[32][32];
    int row = blockIdx.y * 32 + threadIdx.y;
    int col = blockIdx.x * 32 + threadIdx.x;
    float sum = 0.0f;
    for (int k = 0; k < N / 32; k++) {
        As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col];
        __syncthreads();
        for (int n = 0; n < 32; n++)
            sum += As[threadIdx.y][n] * Bs[n][threadIdx.x];
        __syncthreads();
    }
    C[row * N + col] = sum;
}"""


def _extract_hip_code(detail: object) -> str:
    """Return the HIP source carried by an event's ``detail`` field, or ``""``.

    ``detail`` may be a JSON-encoded string or an already-decoded object.
    The first non-empty value among ``hip_code``, ``optimized_code`` and
    ``translated_code`` is used; anything that is not a JSON object, or whose
    selected value is not a string, yields ``""``.
    """
    if isinstance(detail, str):
        try:
            detail = json.loads(detail)
        except json.JSONDecodeError:
            return ""
    if not isinstance(detail, dict):
        return ""
    candidate = (
        detail.get("hip_code")
        or detail.get("optimized_code")
        or detail.get("translated_code")
    )
    return candidate if isinstance(candidate, str) else ""


def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool):
    """Stream a CUDA-to-HIP port from the backend as ``(log_markdown, hip_code)``.

    Posts the kernel to ``{BACKEND_URL}/port`` and reads the response as a
    stream of ``data:`` lines, each holding one JSON event. Every event
    appends one line to the Markdown log; a ``done`` event whose ``detail``
    carries HIP source replaces the current ``hip_code``. A ``[DONE]`` payload
    ends the stream.

    Args:
        cuda_code: CUDA source to port; rejected when empty or shorter than
            10 characters after stripping surrounding whitespace.
        kernel_name: Name sent to the backend; blank falls back to
            ``"custom"``.
        simple_mode: Forwarded to the backend as the ``simple_mode`` flag.

    Yields:
        ``(log_markdown, hip_code)`` after the connection notice, after each
        parsed event, and once more with a closing status line. Connection,
        timeout, HTTP and other errors are appended to the log and end the
        generator instead of being raised.
    """
    if not cuda_code or len(cuda_code.strip()) < 10:
        yield "Please provide CUDA kernel code (at least 10 characters).", ""
        return

    kernel_name = (kernel_name or "").strip() or "custom"
    log_lines: list[str] = []
    hip_code = ""

    payload = {
        "cuda_code": cuda_code,
        "kernel_name": kernel_name,
        "simple_mode": bool(simple_mode),
    }

    log_lines.append("**Connecting to ROCmPort AI backend...**")
    yield "\n\n".join(log_lines), hip_code

    try:
        with httpx.Client(timeout=180.0) as client, client.stream(
            "POST", f"{BACKEND_URL}/port", json=payload
        ) as resp:
            try:
                resp.raise_for_status()
            except httpx.HTTPStatusError:
                # A streamed body is not loaded automatically. Read it while
                # the connection is still open so the handler below can quote
                # exc.response.text without raising a second error.
                resp.read()
                raise

            for line in resp.iter_lines():
                # Accept "data:" with or without the optional space after
                # the colon; blank lines and other fields are ignored.
                if not line.startswith("data:"):
                    continue

                data = line[5:].strip()
                if data == "[DONE]":
                    break

                try:
                    event = json.loads(data)
                except json.JSONDecodeError:
                    continue
                if not isinstance(event, dict):
                    # Valid JSON but not an event object: skip it like any
                    # other malformed line rather than aborting the stream.
                    continue

                # `or` (not a .get default) so an explicit null falls back too.
                agent = str(event.get("agent") or "system")
                status = str(event.get("status") or "running")
                message = event.get("message") or ""
                detail = event.get("detail")

                label = AGENT_LABELS.get(agent, agent.capitalize())
                s_label = STATUS_LABELS.get(status, status)

                log_lines.append(f"**{label}** {s_label} -- {message}")

                if status == "done" and detail:
                    # Keep the previous code when this event carries none.
                    hip_code = _extract_hip_code(detail) or hip_code

                yield "\n\n".join(log_lines), hip_code

    except httpx.ConnectError:
        log_lines.append(
            "**Could not connect to backend.**\n\n"
            "> The server may be cold-starting -- please wait ~30 s and retry."
        )
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.TimeoutException:
        log_lines.append("**Request timed out.** The pipeline may still be running -- try again shortly.")
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.HTTPStatusError as exc:
        log_lines.append(f"**HTTP {exc.response.status_code}**: {exc.response.text[:300]}")
        yield "\n\n".join(log_lines), hip_code
        return
    except Exception as exc:  # noqa: BLE001
        # Last-resort boundary: surface the failure in the UI log.
        log_lines.append(f"**Unexpected error**: {exc}")
        yield "\n\n".join(log_lines), hip_code
        return

    if not hip_code:
        log_lines.append("\nPipeline finished but no HIP code was extracted. Check agent logs above.")
    else:
        log_lines.append("\n**Migration complete.** HIP code is shown on the right.")

    yield "\n\n".join(log_lines), hip_code


# Stylesheet passed to demo.launch(): styles the "panel-header" class used on
# the Input/Output headings and hides the page footer.
CSS = " ".join(
    [
        ".panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; }",
        "footer { display: none !important; }",
    ]
)

# Page definition. Components are created in display order: an intro banner,
# a two-column row (input on the left, output on the right), a static
# explainer section, and finally the button wiring.
with gr.Blocks(title="ROCmPort AI -- CUDA to ROCm Migration") as demo:

    # Intro banner with links to the backend and the repository.
    gr.Markdown(
        """# ROCmPort AI
### Agentic CUDA to ROCm/HIP migration with wavefront-64 bug detection

> **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) |
> **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI)

`hipify-clang` translates CUDA API calls mechanically -- it **cannot** detect that `if (tid < 32)` in a
warp-level reduction silently skips lanes 32-63 on AMD wavefront-64.
The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.**
"""
    )

    with gr.Row():
        # Left column: CUDA editor, options, action buttons and demo kernels.
        with gr.Column(scale=1):
            gr.Markdown("### Input", elem_classes="panel-header")
            # Starts pre-filled with the reduction demo kernel.
            cuda_input = gr.Code(
                label="CUDA Kernel Code",
                language="cpp",
                lines=22,
                value=EXAMPLE_REDUCTION,
            )
            with gr.Row():
                kernel_name = gr.Textbox(
                    label="Kernel Name",
                    value="reduction",
                    placeholder="e.g. reduction, matmul, vector_add",
                    scale=2,
                )
                simple_mode = gr.Checkbox(
                    label="Explain Like I am 5",
                    value=False,
                    scale=1,
                )
            with gr.Row():
                port_btn = gr.Button("Port to ROCm", variant="primary", scale=3)
                clear_btn = gr.Button("Clear", scale=1)

            # Each example row fills cuda_input, kernel_name and simple_mode,
            # in that order.
            gr.Examples(
                examples=[
                    [EXAMPLE_REDUCTION, "reduction", False],
                    [EXAMPLE_VECTOR_ADD, "vector_add", False],
                    [EXAMPLE_MATMUL, "matmul", False],
                ],
                inputs=[cuda_input, kernel_name, simple_mode],
                label="Demo Kernels (pre-loaded with intentional AMD bugs)",
            )

        # Right column: the agent log and the resulting HIP code.
        with gr.Column(scale=1):
            gr.Markdown("### Output", elem_classes="panel-header")
            log_output = gr.Markdown(
                value="*Agent steps will appear here once you click **Port to ROCm**.*",
                label="Agent Pipeline Log",
            )
            hip_output = gr.Code(
                label="Translated and Optimized HIP Code",
                language="cpp",
                lines=18,
            )

    # Static explainer: agent roles, the warp-size bug, benchmark highlights.
    gr.Markdown(
        """
---
### How the pipeline works

| Agent | Role |
|-------|------|
| **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout |
| **Translator** | Runs `hipify` then applies LLM-guided fixes for bugs `hipify` cannot detect |
| **Tester** | Verifies compilation with `hipcc` and checks output correctness |
| **Optimizer** | Proposes MI300X-specific optimisations; re-tested against baseline |
| **Coordinator** | Orchestrates the loop; retries up to 3x if the optimised output regresses |

### The key bug: warp-size assumption

```c
// NVIDIA (warpSize = 32) -- silently WRONG on AMD
if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... }

// AMD-correct (wavefront = 64)
if (tid < 64) {
    vsmem[tid] += vsmem[tid + 32];
    if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... }
}
```

### Benchmark highlights (MI300X, ROCm 7.0)

| Kernel | Result |
|--------|--------|
| matrix_multiply 512x512 | 2.91x speedup over baseline HIP |
| vector_add 32M elements | ~3918 GB/s (~74% of MI300X peak) |
| reduction 16M elements | correctness PASS after wavefront-64 fix |

> Source: `docs/benchmark_runs/` -- real `rocprof` CSV output, May 2026.
> Results vary with kernel complexity; these figures are not guaranteed on every input.
"""
    )

    # port_kernel yields (log_markdown, hip_code) pairs, which map onto the
    # two outputs below in the same order.
    port_btn.click(
        fn=port_kernel,
        inputs=[cuda_input, kernel_name, simple_mode],
        outputs=[log_output, hip_output],
    )

    # Resets only the output side: the log returns to its initial placeholder
    # text and the HIP code box is emptied. The CUDA input is left as is.
    clear_btn.click(
        fn=lambda: ("*Agent steps will appear here once you click **Port to ROCm**.*", ""),
        outputs=[log_output, hip_output],
    )

if __name__ == "__main__":
    # Start the app only when run as a script. The orange-accented default
    # theme and the module stylesheet are supplied at launch time.
    # NOTE(review): passing theme/css to launch() requires a Gradio version
    # that accepts them there -- confirm against the pinned gradio version.
    launch_options = {
        "theme": gr.themes.Default(primary_hue="orange"),
        "css": CSS,
    }
    demo.launch(**launch_options)