Eeppa committed on
Commit
fe9267f
·
verified ·
1 Parent(s): 56bf91e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -23
app.py CHANGED
@@ -1,39 +1,54 @@
1
  import gradio as gr
2
- from transformers import pipeline
 
 
3
 
4
- # Initialize the pipeline
5
- generator = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-135M-Instruct")
 
 
6
 
7
  def generate_response(message, history):
8
- # This 'system_prompt' anchors the AI
9
- system_prompt = "<|im_start|>system\nYou are a concise and helpful assistant. No yapping.<|im_end|>\n"
10
 
11
- # Build the conversation history so it has a memory
12
- full_prompt = system_prompt
13
  for user_msg, assistant_msg in history:
14
- full_prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
 
 
15
 
16
- # Add the current message
17
- full_prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
 
18
 
19
- # Generate with settings that prevent rambling
20
- output = generator(
21
- full_prompt,
22
- max_new_tokens=256,
23
- temperature=0.4,
24
- do_sample=True,
25
- repetition_penalty=1.2
 
 
 
 
26
  )
27
 
28
- # Clean the output to only show the assistant's new text
29
- response = output[0]['generated_text']
30
- return response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
 
 
 
 
 
31
 
32
- # Create the Chat Interface
33
  demo = gr.ChatInterface(
34
  fn=generate_response,
35
- title="Not So Slow AI",
36
- description="SmolLM2 135M: Now with 100% less accidental festival advice."
37
  )
38
 
39
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
3
+ import torch
4
+ from threading import Thread
5
 
6
+ # Load model and tokenizer properly for streaming
7
+ model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
8
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
9
+ model = AutoModelForCausalLM.from_pretrained(model_id)
10
 
11
  def generate_response(message, history):
12
+ # Strict system prompt to keep it grounded
13
+ system_prompt = "You are a helpful, very brief assistant. Do not imagine stories or contexts. Answer only what is asked."
14
 
15
+ # Build chat format
16
+ messages = [{"role": "system", "content": system_prompt}]
17
  for user_msg, assistant_msg in history:
18
+ messages.append({"role": "user", "content": user_msg})
19
+ messages.append({"role": "assistant", "content": assistant_msg})
20
+ messages.append({"role": "user", "content": message})
21
 
22
+ # Convert to model's specific format
23
+ input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
24
+ inputs = tokenizer([input_text], return_tensors="pt")
25
 
26
+ # Set up the streamer
27
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
28
+
29
+ # Run generation in a separate thread
30
+ generation_kwargs = dict(
31
+ inputs,
32
+ streamer=streamer,
33
+ max_new_tokens=150, # Keep responses short to prevent yapping
34
+ temperature=0.3, # Low temp = more "sane"
35
+ repetition_penalty=1.2,
36
+ do_sample=True
37
  )
38
 
39
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
40
+ thread.start()
41
+
42
+ # Yield the text as it comes in
43
+ partial_text = ""
44
+ for new_text in streamer:
45
+ partial_text += new_text
46
+ yield partial_text
47
 
 
48
  demo = gr.ChatInterface(
49
  fn=generate_response,
50
+ title="Actually Fast AI",
51
+ description="SmolLM2 135M with Streaming. No more imaginary stories!"
52
  )
53
 
54
  if __name__ == "__main__":