import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import threading
import os

# Model and disk-offload settings.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"

# Lazily-initialized globals; the lock serializes generate() calls so
# concurrent Gradio requests don't race on the shared model.
tokenizer = None
model = None
model_lock = threading.Lock()


def load_model():
    """Load the tokenizer and model once, on first use."""
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        os.makedirs(offload_dir, exist_ok=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
            offload_folder=offload_dir,
            torch_dtype=torch.float16,
        )
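
# Note: 8-bit loading requires the bitsandbytes package and (in practice) a
# CUDA GPU; device_map="auto" places layers across the available devices and
# offload_folder spills anything that doesn't fit to disk.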


def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []
    history.append({"role": "user", "content": message})

    # Persona instructions, injected as the system message.
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        "You express emotion, think logically, and talk like a wise, "
        "emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )

    messages = [{"role": "system", "content": system_prompt}, *history]

    # Render the conversation with the model's chat template and move the
    # token tensors to whichever device the model landed on.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
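
    # For Qwen2.5-Instruct the chat template is ChatML-style, so `text` looks
    # roughly like this (illustrative, abridged):
    #   <|im_start|>system
    #   You are Bot, a wise AI. ...<|im_end|>
    #   <|im_start|>user
    #   Hello!<|im_end|>
    #   <|im_start|>assistant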

    try:
        # Serialize access to the shared model across concurrent requests.
        with model_lock:
            with torch.no_grad():
                # max_time is a built-in stopping criterion: generation stops
                # after roughly 30 s and returns whatever was produced so far
                # (a post-hoc time check cannot interrupt a blocking generate()).
                generated_ids = model.generate(
                    **model_inputs, max_new_tokens=256, max_time=30.0
                )
        # Strip the prompt tokens so only the newly generated text is decoded.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        reply = f"[Error: {e}]"

    history.append({"role": "assistant", "content": reply})
    # The second value clears the input textbox (it maps to the `msg` output).
    return history, ""
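
# A minimal sketch of calling predict() directly, bypassing the UI (assumes
# the model weights are cached locally or downloadable from the Hub):
#   history, reply = predict(None, "Hello!", bot_name="Sage", tone="calm")
#   print(reply)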


def keep_alive(msg="ping"):
    # Trivial health-check endpoint; an external uptime monitor can hit this
    # to keep a hosted demo from idling.
    return "pong"


with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")

        # Submitting the textbox runs predict(); the first output updates the
        # chat history, the second clears the textbox for the next message.
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg],
        )

    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Queue requests so generation runs one at a time rather than in parallel.
demo.queue()

demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
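
# Run with `python app.py` (assuming that filename); Gradio then serves on
# http://0.0.0.0:7860. Setting share=True in launch() instead yields a
# temporary public gradio.live URL.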