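"""Gradio Space serving HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive.

The GGUF checkpoint is downloaded from the Hugging Face Hub, loaded with
llama-cpp-python, and exposed through a chat UI plus a simple JSON API tab.
"""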
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"

print("Downloading model...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"Model downloaded to: {model_path}")

print("Loading model...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,        # prompt + generation context window for this session
    n_gpu_layers=-1,   # offload all layers to the GPU when one is available
    verbose=False,
)
print("Model loaded!")


def format_messages(message: str, history: list, system_prompt: str = "") -> str:
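    """Build a ChatML prompt (<|im_start|>role ... <|im_end|> blocks) from the
    optional system prompt, the prior chat turns, and the new user message,
    ending with an open assistant turn for the model to complete."""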
    formatted = ""
    
    if system_prompt.strip():
        formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    
    for user_msg, assistant_msg in history:
        if user_msg:
            formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        if assistant_msg:
            formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
    
    formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    return formatted


def generate_response(
    message: str,
    history: list,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.8,
    top_k: int = 20,
    max_tokens: int = 2048,
) -> str:
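    """Run one non-streaming completion through llama.cpp and return the reply text."""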
    prompt = format_messages(message, history, system_prompt)
    
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        stop=["<|im_end|>", "<|im_start|>"],
    )
    
    return output["choices"][0]["text"].strip()


def api_generate(
    prompt: str,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.8,
    max_tokens: int = 2048,
) -> dict:
    """
    API endpoint for text generation.
    
    Args:
        prompt: The user prompt/question
        system_prompt: Optional system instruction
        temperature: Sampling temperature (0.0-2.0)
        top_p: Nucleus sampling parameter (0.0-1.0)
        max_tokens: Maximum tokens to generate
        
    Returns:
        Dictionary with 'response' (generated text or None), 'status'
        ('success' or 'error'), and an 'error' message on failure
    """
    try:
        response = generate_response(
            message=prompt,
            history=[],
            system_prompt=system_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
        return {"response": response, "status": "success"}
    except Exception as e:
        return {"response": None, "status": "error", "error": str(e)}


with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🤖 Qwen3.5-9B Uncensored API Interface
        
        Powered by [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive)
        
        **Features:**
        - 9B parameters with a 262K-token context window (this Space loads an 8,192-token context)
        - Fully uncensored (0/465 refusals)
        - Multimodal-capable base model (text, image, video); this Space serves text-only inference
        - Supports 201 languages
        - Q4_K_M GGUF quantization served via llama.cpp
        
        Use the chat interface below or access via API.
        """
    )
    
    with gr.Tab("💬 Chat"):
        chatbot = gr.Chatbot(height=500, label="Conversation")
        
        with gr.Row():
            msg = gr.Textbox(
                label="Message",
                placeholder="Type your message here...",
                scale=4,
                lines=2,
            )
            submit_btn = gr.Button("Send", variant="primary", scale=1)
        
        with gr.Accordion("⚙️ Settings", open=False):
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Optional: Set behavior/personality for the model",
                lines=3,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.8,
                    step=0.05,
                    label="Top P",
                )
            with gr.Row():
                top_k = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=20,
                    step=1,
                    label="Top K",
                )
                max_tokens = gr.Slider(
                    minimum=64,
                    maximum=4096,
                    value=1024,
                    step=64,
                    label="Max Tokens",
                )
        
        clear_btn = gr.Button("🗑️ Clear Chat")
        
        def user_submit(message, history):
            return "", history + [[message, None]]
        
        def bot_response(history, system_prompt, temperature, top_p, top_k, max_tokens):
            if not history:
                return history
            
            message = history[-1][0]
            history_without_last = history[:-1]
            
            response = generate_response(
                message, 
                history_without_last, 
                system_prompt, 
                temperature, 
                top_p, 
                top_k, 
                max_tokens
            )
            history[-1][1] = response
            return history
        
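        # Two-step handler chain: first append the user turn to the chat
        # history, then generate the assistant reply and fill in that turn.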
        msg.submit(
            user_submit, 
            [msg, chatbot], 
            [msg, chatbot]
        ).then(
            bot_response,
            [chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
            chatbot,
        )
        
        submit_btn.click(
            user_submit,
            [msg, chatbot],
            [msg, chatbot]
        ).then(
            bot_response,
            [chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
            chatbot,
        )
        
        clear_btn.click(lambda: [], None, chatbot)
    
    with gr.Tab("🔌 API"):
        gr.Markdown(
            """
            ## API Usage
            
            This Space provides a REST API for programmatic access.
            
            ### Python Example
            
            ```python
            from gradio_client import Client
            
            client = Client("Ngixdev/qwen-api")
            
            result = client.predict(
                prompt="Explain quantum computing in simple terms",
                system_prompt="You are a helpful assistant",
                temperature=0.7,
                top_p=0.8,
                max_tokens=1024,
                api_name="/api_generate"
            )
            print(result)
            ```
            
            ### cURL Example
            
            ```bash
            curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \\
                -H "Content-Type: application/json" \\
                -d '{
                    "data": [
                        "Explain quantum computing",
                        "You are a helpful assistant",
                        0.7,
                        0.8,
                        1024
                    ]
                }'
            ```
            """
        )
        
        with gr.Row():
            with gr.Column():
                api_prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt here...",
                    lines=4,
                )
                api_system = gr.Textbox(
                    label="System Prompt (Optional)",
                    placeholder="Set behavior/personality...",
                    lines=2,
                )
                with gr.Row():
                    api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                    api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
                api_max_tokens = gr.Slider(64, 4096, 1024, step=64, label="Max Tokens")
                api_submit = gr.Button("Generate", variant="primary")
            
            with gr.Column():
                api_output = gr.JSON(label="API Response")
        
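        # api_name="api_generate" registers this handler so it can be called
        # programmatically (see the usage examples in the Markdown above).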
        api_submit.click(
            api_generate,
            [api_prompt, api_system, api_temp, api_top_p, api_max_tokens],
            api_output,
            api_name="api_generate",
        )

demo.launch(server_name="0.0.0.0", server_port=7860)