| import os |
| from collections.abc import Iterator |
| from threading import Thread |
|
|
| import gradio as gr |
| import spaces |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| from transformers.generation.streamers import TextIteratorStreamer |
|
|
| |
# Model checkpoint to serve: an Iraqi-Arabic instruction-tuned 8B model.
model_id = "anaspro/Shako-iraqi-8B-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# bfloat16 halves memory vs float32; device_map="auto" lets the accelerate
# backend place the weights on whatever GPU(s) the Space provides.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16
)


# Hard cap on prompt length in tokens, overridable via the MAX_INPUT_TOKENS
# env var.  int() accepts the "32_000" underscore separator (== 32000).
# The cap exists to avoid CUDA out-of-memory errors (see the warning in
# generate()).
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "32_000"))
|
|
|
|
# Markers used to display the model's thinking trace separately from its final
# answer.  They must stay identical in both directions: _format_output() emits
# them and _history_to_messages() strips them back out of past assistant turns.
# (These were mojibake-corrupted in the previous revision; restored to the
# intended emoji.)
_THINKING_HEADER = "**🤔 Thinking Process:**"
_RESPONSE_HEADER = "**💬 Response:**"


def _history_to_messages(history: list[dict]) -> list[dict]:
    """Convert Gradio `type="messages"` history into plain chat messages.

    Assistant turns may contain the formatted thinking block produced by
    _format_output(); only the text after the response marker is fed back to
    the model so its own chain-of-thought is not replayed as context.
    Every non-assistant turn is treated as a user turn; a dict payload
    (multimodal textbox) contributes only its "text" field.
    """
    messages: list[dict] = []
    for item in history:
        content = item["content"]
        if item["role"] == "assistant":
            if _THINKING_HEADER in content:
                # Keep only the final answer; drop the thinking section.
                _, sep, response = content.partition(_RESPONSE_HEADER)
                if sep:
                    content = response.strip()
            messages.append({"role": "assistant", "content": content})
        elif isinstance(content, str):
            messages.append({"role": "user", "content": content})
        elif isinstance(content, dict):
            messages.append({"role": "user", "content": content.get("text", "")})
    return messages


def _format_output(raw: str, enable_thinking: bool) -> str:
    """Format raw generated text for display.

    While a complete `<think>…</think>` block is present (and thinking mode is
    on), split it into a "Thinking Process" section and a "Response" section.
    Otherwise return the text unchanged (including the partial stream before
    `</think>` has arrived).
    """
    if not (enable_thinking and "<think>" in raw and "</think>" in raw):
        return raw
    # Both substrings are guaranteed present here, so .index() cannot raise.
    think_start = raw.index("<think>") + len("<think>")
    think_end = raw.index("</think>")
    thinking = raw[think_start:think_end].strip()
    response = raw[think_end + len("</think>"):].strip()
    if thinking:
        return f"{_THINKING_HEADER}\n{thinking}\n\n{_RESPONSE_HEADER}\n{response}"
    return f"{_RESPONSE_HEADER}\n{response}"


@spaces.GPU()
@torch.inference_mode()
def generate(message: str | dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512, enable_thinking: bool = True) -> Iterator[str]:
    """Stream a chat completion for a Gradio ChatInterface.

    Args:
        message: Current user message — a plain string, or a multimodal dict
            whose "text" field is used.
        history: Prior turns as ``{"role", "content"}`` dicts.
        system_prompt: Optional system message prepended to the conversation.
        max_new_tokens: Generation length cap.
        enable_thinking: Passed to the chat template; also selects the
            sampling preset and enables `<think>` parsing of the output.

    Yields:
        Progressively longer formatted output strings (Gradio replaces the
        message with each yield).
    """
    messages: list[dict] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(_history_to_messages(history))

    # Current turn: plain string, or multimodal dict with a "text" field.
    current_message = message if isinstance(message, str) else message.get("text", "")
    messages.append({"role": "user", "content": current_message})

    # tokenize=False so we can count tokens ourselves below.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    n_tokens = model_inputs["input_ids"].shape[1]

    if n_tokens > MAX_INPUT_TOKENS:
        gr.Warning(
            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
        )
        yield ""
        return

    # Sampling presets: lower temperature / higher top_p for reasoning,
    # slightly warmer and tighter top_p for casual chat.
    if enable_thinking:
        temperature, top_p, top_k = 0.6, 0.95, 20
    else:
        temperature, top_p, top_k = 0.7, 0.8, 20

    # skip_special_tokens=False keeps the <think>/</think> tags in the stream
    # so _format_output() can split on them.
    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=False)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=0.0,
    )
    # Run generation in a background thread; the streamer feeds this loop.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    output = ""
    for delta in streamer:
        output += delta
        yield _format_output(output, enable_thinking)
|
|
|
|
| |
# Example prompts for the ChatInterface.  Each row is
# [message, system_prompt, max_new_tokens, enable_thinking], matching
# generate()'s additional inputs.  "/think" and "/no_think" are per-turn
# overrides understood by the chat template (see the description text).
examples = [
    ["What is the capital of France? /no_think", "You are a helpful assistant.", 700, True],
    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512, False],
    ["Solve this math problem: If x^2 + 5x + 6 = 0, what are the values of x? /think", "You are a helpful assistant.", 2000, True]
]
|
|
# Default system prompt (Iraqi Arabic): "You are a smart Iraqi model from
# Baghdad. You speak only in the Iraqi dialect. Answer every question with a
# full, expansive explanation; clarify the reasons, background, and important
# information. Use real or everyday Iraqi examples whenever possible. Avoid
# Modern Standard Arabic entirely, and keep the reply long and entertaining."
# NOTE(review): reconstructed from a mojibake-corrupted revision (UTF-8 bytes
# mis-decoded as Windows-874) — confirm wording against the original author.
system_prompt = (
    "انت موديل عراقي ذكي من بغداد. تتحدث باللهجة العراقية فقط. "
    "جاوب على كل سؤال بشرح كامل وموسع، ووضح الأسباب والخلفية والمعلومات المهمة. "
    "استخدم أمثلة عراقية واقعية أو حياتية كلما أمكن. "
    "تجنب الفصحى نهائيًا، وخلي الرد مطول وممتع."
)
| |
# Chat UI.  `type="messages"` delivers history as {"role", "content"} dicts,
# which is what generate() expects.  Emojis in the description were restored
# from mojibake, and the title now matches the model actually loaded
# (anaspro/Shako-iraqi-8B-it) — it previously advertised "Qwen3-14B".
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    textbox=gr.Textbox(
        placeholder="Type your message here...",
        autofocus=True,
    ),
    multimodal=False,
    # Passed positionally to generate() after (message, history):
    # system_prompt, max_new_tokens, enable_thinking.
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=system_prompt),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=32768, step=100, value=2048),
        gr.Checkbox(label="Enable Thinking Mode", value=True, info="Enable for complex reasoning tasks (math, coding). Disable for faster general chat."),
    ],
    title="Shako Iraqi 8B Chatbot with Thinking Mode",
    description="""
🤔 **Thinking Mode ON**: Better for math, coding, and complex reasoning
💬 **Thinking Mode OFF**: Faster responses for general conversation

**💡 Pro Tip**: When Thinking Mode is enabled, you can use:
- `/think` in your message to force thinking for that turn
- `/no_think` in your message to skip thinking for that turn

Example: "Solve this equation: x^2 + 5x + 6 = 0 /think"
""",
    examples=examples,
    stop_btn=False,
    # Right-to-left layout so Arabic responses render naturally.
    css="""
.gradio-container, .chatbot, .chatbot * {
    direction: rtl !important;
    text-align: right !important;
    unicode-bidi: plaintext !important;
    font-family: 'Tajawal', 'Cairo', sans-serif;
}
"""
)
|
|
|
|
# Launch the Gradio server when this module is run directly (as HF Spaces does).
if __name__ == "__main__":
    demo.launch()