# qwen-api / app.py
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
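
# Quantized GGUF weights, fetched from the Hugging Face Hub at startup.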
MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
print("Downloading model...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"Model downloaded to: {model_path}")
print("Loading model...")
llm = Llama(
model_path=model_path,
    n_ctx=8192,  # context window used by this Space (well below the model's advertised 262K)
    n_gpu_layers=-1,  # offload every layer to the GPU; requires a CUDA build of llama-cpp-python
verbose=False,
)
print("Model loaded!")
def format_messages(message: str, history: list, system_prompt: str = "") -> str:
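    """Build a ChatML prompt from the chat history.

    Qwen instruct models use the ChatML template: each turn is wrapped in
    <|im_start|>{role} ... <|im_end|> markers, and the prompt ends with an open
    assistant turn for the model to complete.
    """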
formatted = ""
if system_prompt.strip():
formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
for user_msg, assistant_msg in history:
if user_msg:
formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
if assistant_msg:
formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
return formatted
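
# For a single user turn with no system prompt, format_messages("Hello!", [])
# returns:
#   <|im_start|>user
#   Hello!<|im_end|>
#   <|im_start|>assistant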
def generate_response(
message: str,
history: list,
system_prompt: str = "",
temperature: float = 0.7,
top_p: float = 0.8,
top_k: int = 20,
max_tokens: int = 2048,
) -> str:
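    """Run one blocking (non-streaming) completion and return the stripped text."""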
prompt = format_messages(message, history, system_prompt)
output = llm(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
        stop=["<|im_end|>", "<|im_start|>"],  # halt generation at ChatML turn boundaries
)
return output["choices"][0]["text"].strip()
def api_generate(
prompt: str,
system_prompt: str = "",
temperature: float = 0.7,
top_p: float = 0.8,
max_tokens: int = 2048,
) -> dict:
"""
API endpoint for text generation.
Args:
prompt: The user prompt/question
system_prompt: Optional system instruction
temperature: Sampling temperature (0.0-2.0)
top_p: Nucleus sampling parameter (0.0-1.0)
max_tokens: Maximum tokens to generate
Returns:
Dictionary with 'response' key containing generated text
"""
try:
response = generate_response(
message=prompt,
history=[],
system_prompt=system_prompt,
temperature=temperature,
top_p=top_p,
max_tokens=max_tokens,
)
return {"response": response, "status": "success"}
except Exception as e:
return {"response": None, "status": "error", "error": str(e)}
with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as demo:
gr.Markdown(
"""
        # 🤖 Qwen3.5-9B Uncensored API Interface

        Powered by [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive)

        **Features:**
        - 9B parameters with a 262K-token context window (this Space loads an 8K context)
        - Fully uncensored (0/465 refusals)
        - Multimodal capable (text, image, video); this Space serves text only
        - Supports 201 languages
        - Served with Q4_K_M quantization via llama.cpp

        Use the chat interface below, or call the API programmatically (see the API tab).
"""
)
with gr.Tab("💬 Chat"):
chatbot = gr.Chatbot(height=500, label="Conversation")
with gr.Row():
msg = gr.Textbox(
label="Message",
placeholder="Type your message here...",
scale=4,
lines=2,
)
submit_btn = gr.Button("Send", variant="primary", scale=1)
with gr.Accordion("⚙️ Settings", open=False):
system_prompt = gr.Textbox(
label="System Prompt",
placeholder="Optional: Set behavior/personality for the model",
lines=3,
)
with gr.Row():
temperature = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.7,
step=0.1,
label="Temperature",
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.8,
step=0.05,
label="Top P",
)
with gr.Row():
top_k = gr.Slider(
minimum=1,
maximum=100,
value=20,
step=1,
label="Top K",
)
max_tokens = gr.Slider(
minimum=64,
maximum=4096,
value=1024,
step=64,
label="Max Tokens",
)
clear_btn = gr.Button("🗑️ Clear Chat")
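
        # Chat wiring: user_submit echoes the user's turn into the chatbot right
        # away (so the UI feels responsive), then bot_response generates the reply.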
def user_submit(message, history):
return "", history + [[message, None]]
def bot_response(history, system_prompt, temperature, top_p, top_k, max_tokens):
if not history:
return history
message = history[-1][0]
history_without_last = history[:-1]
response = generate_response(
message,
history_without_last,
system_prompt,
temperature,
top_p,
top_k,
max_tokens
)
history[-1][1] = response
return history
msg.submit(
user_submit,
[msg, chatbot],
[msg, chatbot]
).then(
bot_response,
[chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
chatbot,
)
submit_btn.click(
user_submit,
[msg, chatbot],
[msg, chatbot]
).then(
bot_response,
[chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
chatbot,
)
clear_btn.click(lambda: [], None, chatbot)
with gr.Tab("🔌 API"):
gr.Markdown(
"""
## API Usage
This Space provides a REST API for programmatic access.
### Python Example
```python
from gradio_client import Client
client = Client("Ngixdev/qwen-api")
result = client.predict(
prompt="Explain quantum computing in simple terms",
system_prompt="You are a helpful assistant",
temperature=0.7,
top_p=0.8,
max_tokens=1024,
api_name="/api_generate"
)
print(result)
```
### cURL Example
```bash
curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \\
-H "Content-Type: application/json" \\
-d '{
"data": [
"Explain quantum computing",
"You are a helpful assistant",
0.7,
0.8,
1024
]
}'
```
"""
)
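
        # Interactive tester for the same endpoint the snippets above call.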
with gr.Row():
with gr.Column():
api_prompt = gr.Textbox(
label="Prompt",
placeholder="Enter your prompt here...",
lines=4,
)
api_system = gr.Textbox(
label="System Prompt (Optional)",
placeholder="Set behavior/personality...",
lines=2,
)
with gr.Row():
api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
api_max_tokens = gr.Slider(64, 4096, 1024, step=64, label="Max Tokens")
api_submit = gr.Button("Generate", variant="primary")
with gr.Column():
api_output = gr.JSON(label="API Response")
api_submit.click(
api_generate,
[api_prompt, api_system, api_temp, api_top_p, api_max_tokens],
api_output,
api_name="api_generate",
)
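
# 0.0.0.0:7860 is the host/port a Hugging Face Space expects the app to bind.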
demo.launch(server_name="0.0.0.0", server_port=7860)