# Hugging Face Space (status at capture time: Sleeping)
import gradio as gr
from transformers import pipeline, AutoTokenizer

# Small instruction-tuned chat model served by this Space.
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
# device_map="auto" lets transformers place the model on GPU when one is available.
pipe = pipeline("text-generation", model=model_id, device_map="auto")
# Tokenizer is loaded separately so we can apply the model's chat template ourselves.
tokenizer = AutoTokenizer.from_pretrained(model_id)
def chat(message, history):
    """Generate a single (non-streaming) reply for the Gradio ChatInterface.

    Parameters
    ----------
    message : str
        The user's latest message.
    history : list
        Prior turns; assumed to be {"role", "content"} dicts (Gradio
        "messages" format) — TODO confirm against the ChatInterface config.
        Fix: previously ignored entirely, now included so the model sees
        the conversation context.

    Returns
    -------
    str
        The assistant's reply, with the prompt prefix and the Qwen
        end-of-turn marker stripped.
    """
    # 1. Build the conversation in the order chat templates expect:
    #    system prompt first, then prior turns, then the current user turn.
    #    (Fix: the original put the system message AFTER the user message
    #    and dropped history.)
    messages = [{"role": "system", "content": "Use context if relevant, otherwise answer generally."}]
    messages.extend(history or [])
    messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # 2. Generate with explicit stop/pad tokens so output terminates cleanly.
    outputs = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # 3. Strip the echoed prompt so ONLY the new answer remains, cutting at
    #    the Qwen end-of-turn marker.
    generated_text = outputs[0]['generated_text']
    answer = generated_text[len(prompt):].split("<|im_end|>")[0].strip()
    return answer
from threading import Thread
from transformers import TextIteratorStreamer

# Raw HF model object from the pipeline; needed for model.generate() in the
# streaming handler below.
model = pipe.model
# Streaming handler (currently unused — the app launches with the
# non-streaming `chat` instead).
def chat_new(message, history):
    """Stream a reply token-by-token for a Gradio ChatInterface.

    Parameters
    ----------
    message : str
        The user's latest message.
    history : list
        Prior turns; assumed to be {"role", "content"} dicts (Gradio
        "messages" format) — TODO confirm.

    Yields
    ------
    str
        The accumulated reply so far, re-yielded as each new token arrives.
    """
    # Build the conversation in the correct order: system prompt first, then
    # prior turns, then the current user message. (Fix: the original put the
    # user turn first and appended history AFTER the system message, which
    # scrambled the rendered conversation.)
    messages = [{"role": "system", "content": "Use context if relevant, otherwise answer generally."}]
    for turn in history:
        messages.append(turn)
    messages.append({"role": "user", "content": message})

    # skip_prompt/skip_special_tokens: the streamer yields only new answer text.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(pipe.device)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)

    # Run generation in a background thread so the UI can consume tokens as
    # they are produced instead of blocking until completion.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message

    # The streamer is exhausted only once generation finishes; join to
    # release the worker thread promptly. (Fix: original never joined.)
    thread.join()
# Launch the Gradio chat UI with the non-streaming handler `chat`.
gr.ChatInterface(chat).launch()