"""ZeroGPU Gradio chat app: 4-bit NF4 loading, thinking controls, and streamed generation."""

import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

MODEL_ID = "Qwen/Qwen3.6-27B"
TITLE = "Qwen3.6-27B Zero"
SUBTITLE = "Text-only Qwen3.6 deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
DESCRIPTION = (
    "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
    "and shorter default generation lengths for better queue behavior."
)
SYSTEM_PROMPT = (
    "You are Qwen3.6-27B, a highly capable assistant for coding, research, and long-form reasoning. "
    "Be clear, accurate, and useful."
)
PLACEHOLDER = (
    "Ask for code, debugging, planning, long-form answers, or agentic workflows. "
    "Thinking mode is enabled by default."
)

# Context and generation budgets, kept modest for better ZeroGPU queue behavior.
MAX_INPUT_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 4096
MAX_NEW_TOKENS = 8192

HF_TOKEN = os.environ.get("HF_TOKEN")

# Allocator and matmul settings that reduce fragmentation and improve throughput on the shared GPU.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
torch.backends.cuda.matmul.allow_tf32 = True

# NF4 4-bit quantization with bf16 compute keeps the 27B model within ZeroGPU memory limits.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
    device_map={"": 0},
    dtype=torch.bfloat16,
    quantization_config=BNB_CONFIG,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
model.eval()


def model_input_device():
    """Return the device that model inputs should be moved to."""
    return next(model.parameters()).device


def estimate_duration(
    message,
    history,
    system_prompt,
    enable_thinking,
    preserve_thinking,
    temperature,
    max_new_tokens,
    top_p,
    top_k,
    repetition_penalty,
):
    """Estimate the ZeroGPU allocation (in seconds) from the requested generation length."""
    del message, history, system_prompt, enable_thinking, preserve_thinking
    del temperature, top_p, top_k, repetition_penalty
    return min(240, max(90, 60 + int(max_new_tokens / 64)))


def build_messages(history, message, system_prompt):
    """Convert the tuple-style Gradio history into a chat-format message list."""
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    # Keep only the most recent turns so the rendered prompt stays within the bounded context.
    trimmed_history = history[-8:]
    for user_text, assistant_text in trimmed_history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU(duration=estimate_duration, size="large")
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    enable_thinking: bool,
    preserve_thinking: bool,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    messages = build_messages(history, message, system_prompt)
    # Thinking-mode switches are passed straight through to the chat template,
    # which is how Qwen templates expect flags such as `enable_thinking`.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )
    inputs = tokenizer(
        rendered_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    ).to(model_input_device())
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=120.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=max(temperature, 1e-5),
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    # Run generation in a background thread and stream partial output to the UI.
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()
    output = ""
    for chunk in streamer:
        output += chunk
        yield output


CSS = """
.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
.subtitle p, .meta p { text-align: center; }
.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
.duplicate-button { margin: 0 auto 14px auto !important; }
"""

chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.Markdown(f"# {TITLE}", elem_classes="title")
    gr.Markdown(SUBTITLE, elem_classes="subtitle")
    gr.Markdown(
        f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
        elem_classes="meta",
    )
    gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
            gr.Checkbox(value=True, label="Enable thinking", render=False),
            gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
            gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
            gr.Slider(
                minimum=1024,
                maximum=MAX_NEW_TOKENS,
                step=512,
                value=DEFAULT_MAX_NEW_TOKENS,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
            gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
            gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
        ],
        examples=[
            ["Design a production-ready architecture for a SaaS analytics platform with clear tradeoffs."],
            ["Write a detailed debugging plan for a flaky async Python test suite."],
            ["Build a responsive landing page in React and Tailwind for a premium AI coding product."],
            ["Refactor this idea into a clear engineering plan: multi-tenant background job processing with retries and observability."],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()