# qwen-api / app.py
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
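
# Quantized GGUF weights, fetched from the Hugging Face Hub at startup.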
MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
print("Downloading model...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"Model downloaded to: {model_path}")
print("Loading model...")
llm = Llama(
model_path=model_path,
    n_ctx=8192,  # context window used by this Space (well below the model's advertised 262K)
    n_gpu_layers=-1,  # offload every layer to the GPU; requires a CUDA build of llama-cpp-python
verbose=False,
)
print("Model loaded!")
def format_messages(message: str, history: list, system_prompt: str = "") -> str:
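    """Build a ChatML prompt from the chat history.

    Qwen instruct models use the ChatML template: each turn is wrapped in
    <|im_start|>{role} ... <|im_end|> markers, and the prompt ends with an open
    assistant turn for the model to complete.
    """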
formatted = ""
if system_prompt.strip():
formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
for user_msg, assistant_msg in history:
if user_msg:
formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
if assistant_msg:
formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
return formatted
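
# For a single user turn with no system prompt, format_messages("Hello!", [])
# returns:
#   <|im_start|>user
#   Hello!<|im_end|>
#   <|im_start|>assistant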
def generate_response(
message: str,
history: list,
system_prompt: str = "",
temperature: float = 0.7,
top_p: float = 0.8,
top_k: int = 20,
max_tokens: int = 2048,
) -> str:
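    """Run one blocking (non-streaming) completion and return the stripped text."""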
prompt = format_messages(message, history, system_prompt)
output = llm(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
        stop=["<|im_end|>", "<|im_start|>"],  # halt generation at ChatML turn boundaries
)
return output["choices"][0]["text"].strip()
def api_generate(
prompt: str,
system_prompt: str = "",
temperature: float = 0.7,
top_p: float = 0.8,
max_tokens: int = 2048,
) -> dict:
"""
API endpoint for text generation.
Args:
prompt: The user prompt/question
system_prompt: Optional system instruction
temperature: Sampling temperature (0.0-2.0)
top_p: Nucleus sampling parameter (0.0-1.0)
max_tokens: Maximum tokens to generate
Returns:
Dictionary with 'response' key containing generated text
"""
try:
response = generate_response(
message=prompt,
history=[],
system_prompt=system_prompt,
temperature=temperature,
top_p=top_p,
max_tokens=max_tokens,
)
return {"response": response, "status": "success"}
except Exception as e:
return {"response": None, "status": "error", "error": str(e)}
with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as demo:
gr.Markdown(
"""
        # 🤖 Qwen3.5-9B Uncensored API Interface

        Powered by [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive)

        **Features:**
        - 9B parameters with a 262K-token context window (this Space loads an 8K context)
        - Fully uncensored (0/465 refusals)
        - Multimodal capable (text, image, video); this Space serves text only
        - Supports 201 languages
        - Served with Q4_K_M quantization via llama.cpp

        Use the chat interface below, or call the API programmatically (see the API tab).
"""
)
with gr.Tab("💬 Chat"):
chatbot = gr.Chatbot(height=500, label="Conversation")
with gr.Row():
msg = gr.Textbox(
label="Message",
placeholder="Type your message here...",
scale=4,
lines=2,
)
submit_btn = gr.Button("Send", variant="primary", scale=1)
with gr.Accordion("⚙️ Settings", open=False):
system_prompt = gr.Textbox(
label="System Prompt",
placeholder="Optional: Set behavior/personality for the model",
lines=3,
)
with gr.Row():
temperature = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.7,
step=0.1,
label="Temperature",
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.8,
step=0.05,
label="Top P",
)
with gr.Row():
top_k = gr.Slider(
minimum=1,
maximum=100,
value=20,
step=1,
label="Top K",
)
max_tokens = gr.Slider(
minimum=64,
maximum=4096,
value=1024,
step=64,
label="Max Tokens",
)
clear_btn = gr.Button("🗑️ Clear Chat")
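
        # Chat wiring: user_submit echoes the user's turn into the chatbot right
        # away (so the UI feels responsive), then bot_response generates the reply.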
def user_submit(message, history):
return "", history + [[message, None]]
def bot_response(history, system_prompt, temperature, top_p, top_k, max_tokens):
if not history:
return history
message = history[-1][0]
history_without_last = history[:-1]
response = generate_response(
message,
history_without_last,
system_prompt,
temperature,
top_p,
top_k,
max_tokens
)
history[-1][1] = response
return history
msg.submit(
user_submit,
[msg, chatbot],
[msg, chatbot]
).then(
bot_response,
[chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
chatbot,
)
submit_btn.click(
user_submit,
[msg, chatbot],
[msg, chatbot]
).then(
bot_response,
[chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
chatbot,
)
clear_btn.click(lambda: [], None, chatbot)
with gr.Tab("🔌 API"):
gr.Markdown(
"""
## API Usage
This Space provides a REST API for programmatic access.
### Python Example
```python
from gradio_client import Client
client = Client("Ngixdev/qwen-api")
result = client.predict(
prompt="Explain quantum computing in simple terms",
system_prompt="You are a helpful assistant",
temperature=0.7,
top_p=0.8,
max_tokens=1024,
api_name="/api_generate"
)
print(result)
```
### cURL Example
```bash
curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \\
-H "Content-Type: application/json" \\
-d '{
"data": [
"Explain quantum computing",
"You are a helpful assistant",
0.7,
0.8,
1024
]
}'
```
"""
)
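
        # Interactive tester for the same endpoint the snippets above call.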
with gr.Row():
with gr.Column():
api_prompt = gr.Textbox(
label="Prompt",
placeholder="Enter your prompt here...",
lines=4,
)
api_system = gr.Textbox(
label="System Prompt (Optional)",
placeholder="Set behavior/personality...",
lines=2,
)
with gr.Row():
api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
api_max_tokens = gr.Slider(64, 4096, 1024, step=64, label="Max Tokens")
api_submit = gr.Button("Generate", variant="primary")
with gr.Column():
api_output = gr.JSON(label="API Response")
api_submit.click(
api_generate,
[api_prompt, api_system, api_temp, api_top_p, api_max_tokens],
api_output,
api_name="api_generate",
)
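
# 0.0.0.0:7860 is the host/port a Hugging Face Space expects the app to bind.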
demo.launch(server_name="0.0.0.0", server_port=7860)