import gradio as gr
from transformers import pipeline, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"
pipe = pipeline("text-generation", model=model_id, device_map="auto")  # device_map="auto" auto-detects the GPU
tokenizer = AutoTokenizer.from_pretrained(model_id)


def chat(message, history):
    # 1. Format the conversation using the official chat template.
    #    The system message must come before the user turn.
    messages = [
        {"role": "system", "content": "Use context if relevant, otherwise answer generally."},
        {"role": "user", "content": message},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # 2. Generate with a 'stop' token
    outputs = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # 3. Strip the prompt so you ONLY get the answer.
    #    Qwen's ChatML template ends each turn with <|im_end|>, so cut there.
    generated_text = outputs[0]["generated_text"]
    answer = generated_text[len(prompt):].split("<|im_end|>")[0].strip()
    return answer


from threading import Thread
from transformers import TextIteratorStreamer

model = pipe.model


def chat_new(message, history):
    # System prompt first, then the prior turns, then the new user message.
    # (Appending `history` directly assumes Gradio's "messages" history format,
    # i.e. a list of {"role": ..., "content": ...} dicts.)
    messages = [{"role": "system", "content": "Use context if relevant, otherwise answer generally."}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(pipe.device)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)

    # 4. Start generation in a separate thread to avoid blocking the UI
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the growing response so Gradio re-renders it as tokens arrive.
    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message


# Reverting to the original non-streaming handler for the launch.
gr.ChatInterface(chat).launch()
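# Optional: to launch with the streaming handler instead, point ChatInterface
# at chat_new. A minimal sketch, not part of the original launch; passing
# type="messages" is an assumption here, needed so that `history` arrives as
# role/content dicts (as chat_new expects) rather than (user, bot) tuples:
#
# gr.ChatInterface(chat_new, type="messages").launch()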