# PLNB-Model-SMOL / app.py
import gradio as gr
from threading import Thread
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
pipe = pipeline("text-generation", model=model_id, device_map="auto")  # device_map="auto" auto-detects a GPU if one is available
tokenizer = AutoTokenizer.from_pretrained(model_id)
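
# For reference: Qwen2.5 chat templates use the ChatML format, so apply_chat_template
# below produces a prompt roughly of this shape (illustrative sketch; the exact text
# comes from the tokenizer's bundled template):
#   <|im_start|>system\nUse context if relevant, otherwise answer generally.<|im_end|>\n
#   <|im_start|>user\n{message}<|im_end|>\n
#   <|im_start|>assistant\n
# This is why chat() below splits the generated text on "<|im_end|>".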
def chat(message, history):
    # 1. Format the conversation using the model's official chat template
    #    (system message first, then the user turn)
    messages = [
        {"role": "system", "content": "Use context if relevant, otherwise answer generally."},
        {"role": "user", "content": message},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # 2. Generate, stopping at the end-of-sequence token
    outputs = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # 3. Strip the prompt so only the new answer remains
    generated_text = outputs[0]["generated_text"]
    answer = generated_text[len(prompt):].split("<|im_end|>")[0].strip()
    return answer
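
# Quick sanity check of the non-streaming path (hypothetical usage; history is
# ignored by chat(), so an empty list is fine):
#   print(chat("What is the capital of France?", []))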
model = pipe.model

# Streaming variant, kept for reference (the app reverted to the non-streaming chat() above)
def chat_new(message, history):
    # 1. Rebuild the conversation: system prompt, prior turns, then the new user message
    messages = [{"role": "system", "content": "Use context if relevant, otherwise answer generally."}]
    for msg in history:
        messages.append(msg)
    messages.append({"role": "user", "content": message})

    # 2. Tokenize the templated prompt and set up a streamer that skips the prompt tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(pipe.device)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)

    # 3. Start generation in a separate thread to avoid blocking the UI
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # 4. Yield progressively longer partial answers as tokens arrive
    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message
    thread.join()
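
# The generator can also be consumed outside Gradio (hypothetical usage sketch;
# each yielded value is the full answer so far, not a delta):
#   for partial in chat_new("Tell me a joke", []):
#       print(partial)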
gr.ChatInterface(chat).launch()
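
# To wire up the streaming variant instead, ChatInterface accepts generator
# functions; with dict-style history entries, type="messages" is needed
# (a sketch, assuming a recent Gradio version that supports type="messages"):
#   gr.ChatInterface(chat_new, type="messages").launch()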