"""axentx coder-zero-gpu-1 — proxy to HF Inference Router for Qwen3-Coder.

No model is loaded on the Space (this avoids GPU init issues). This is just
a thin wrapper that forwards requests to the HF Router with the Space owner's
token. It adds an independent rate-limit bucket for the pipeline.
"""
import os, json, urllib.request
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import gradio as gr

# Bearer token sent to the HF Router; falls back to an empty string when the
# HF_TOKEN environment variable is missing (auto-set by Space).
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Upstream model identifier; can be overridden through the MODEL_ID env var.
MODEL = os.getenv("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")


def _call_hf_router(messages, max_tokens=1024, temperature=0.3):
    """POST a chat-completion request to the HF Router and return its JSON.

    Args:
        messages: Conversation to send; serialized as-is under "messages".
        max_tokens: Value forwarded as "max_tokens".
        temperature: Value forwarded as "temperature".

    Returns:
        The response body decoded with ``json.loads``.

    Nothing is caught here: errors raised by ``urlopen`` (including the
    60-second timeout) or by JSON decoding propagate to the caller.
    """
    payload = {
        "model": MODEL,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    request_headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    }
    request = urllib.request.Request(
        "https://router.huggingface.co/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers=request_headers,
        method="POST",
    )
    # Read the whole body while the connection is open, then parse it.
    with urllib.request.urlopen(request, timeout=60) as response:
        raw_body = response.read()
    return json.loads(raw_body)


app = FastAPI()
# Fully permissive CORS: every origin, method and header is accepted.
# NOTE(review): together with the server-side token this lets any web page
# call the proxy — confirm that is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""
    # Conversation to forward; only required to be a list. Elements are
    # presumably OpenAI-style {"role", "content"} dicts — not validated here.
    messages: list
    # Forwarded to the router as "max_tokens".
    max_tokens: int = 1024
    # Forwarded to the router as "temperature".
    temperature: float = 0.3
    # Accepted from clients, but the handler in this file does not forward it;
    # the upstream model always comes from the MODEL constant.
    model: str = "axentx-coder-1"


@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    """Forward a chat-completion request to the HF Router.

    Only ``messages``, ``max_tokens`` and ``temperature`` are forwarded; the
    client-supplied ``model`` is ignored in favour of the configured MODEL.

    Returns:
        The router's JSON response, unchanged.

    Raises:
        HTTPException: with the upstream status code and error body when the
            router answers with an HTTP error (e.g. 401, 429), 504 when the
            upstream call times out, and 502 when the router cannot be
            reached. Without this translation every upstream failure would
            reach the client as an opaque 500.
    """
    # Local imports keep this block self-contained; both packages are already
    # dependencies of this module (fastapi, urllib).
    from urllib.error import HTTPError, URLError

    from fastapi import HTTPException

    try:
        return _call_hf_router(req.messages, req.max_tokens, req.temperature)
    except HTTPError as err:
        # Relay the upstream status and error payload so clients can react
        # to rate limits / auth failures instead of a generic server error.
        raw = err.read().decode("utf-8", errors="replace")
        try:
            detail = json.loads(raw)
        except ValueError:
            detail = raw or str(err.reason)
        raise HTTPException(status_code=err.code, detail=detail) from err
    except URLError as err:
        # Connection-level failure (DNS, refused, connect timeout, ...).
        status = 504 if isinstance(err.reason, TimeoutError) else 502
        raise HTTPException(
            status_code=status,
            detail=f"upstream request failed: {err.reason}",
        ) from err
    except TimeoutError as err:
        # Timeout while reading the response body.
        raise HTTPException(
            status_code=504, detail="upstream request timed out"
        ) from err


@app.get("/health")
def h():
    """Health probe: report a static OK status plus the configured model."""
    report = dict(status="ok", backend="hf-router", model=MODEL)
    return report


def _ui(message, history):
    """Gradio chat callback: replay the history, add the new message, reply.

    Args:
        message: The user's latest input.
        history: Earlier turns as dicts; entries without a truthy "role" are
            skipped, and only "role"/"content" are kept from the rest.

    Returns:
        The content of the first choice in the router's response.
    """
    conversation = []
    for turn in history or []:
        if not turn.get("role"):
            continue
        conversation.append({"role": turn["role"], "content": turn["content"]})
    conversation.append({"role": "user", "content": message})

    reply = _call_hf_router(conversation)
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"]


# Chat UI backed by _ui; type="messages" is what makes history arrive as the
# role/content dicts that _ui reads.
demo = gr.ChatInterface(
    _ui,
    title=f"axentx Coder Proxy — {MODEL}",
    type="messages",
)
# Serve the Gradio UI from the root path of the same FastAPI application.
app = gr.mount_gradio_app(app, demo, path="/")