Spaces:

Eeppa
/

Very_Slow.Ai

Sleeping

App Files Community

Eeppa committed 11 days ago

Commit

fe9267f

verified ·

1 Parent(s): 56bf91e

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -23

app.py CHANGED Viewed

@@ -1,39 +1,54 @@
 import gradio as gr
-from transformers import pipeline
-# Initialize the pipeline
-generator = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-135M-Instruct")
 def generate_response(message, history):
-    # This 'system_prompt' anchors the AI
-    system_prompt = "<|im_start|>system\nYou are a concise and helpful assistant. No yapping.<|im_end|>\n"
-    # Build the conversation history so it has a memory
-    full_prompt = system_prompt
     for user_msg, assistant_msg in history:
-        full_prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
-    # Add the current message
-    full_prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-    # Generate with settings that prevent rambling
-    output = generator(
-        full_prompt,
-        max_new_tokens=256,
-        temperature=0.4,
-        do_sample=True,
-        repetition_penalty=1.2
     )
-    # Clean the output to only show the assistant's new text
-    response = output[0]['generated_text']
-    return response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
-# Create the Chat Interface
 demo = gr.ChatInterface(
     fn=generate_response,
-    title="Not So Slow AI",
-    description="SmolLM2 135M: Now with 100% less accidental festival advice."
 )
 if __name__ == "__main__":

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import torch
+from threading import Thread
+# Load model and tokenizer properly for streaming
+model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)
 def generate_response(message, history):
+    # Strict system prompt to keep it grounded
+    system_prompt = "You are a helpful, very brief assistant. Do not imagine stories or contexts. Answer only what is asked."
+    # Build chat format
+    messages = [{"role": "system", "content": system_prompt}]
     for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
+    # Convert to model's specific format
+    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer([input_text], return_tensors="pt")
+    # Set up the streamer
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Run generation in a separate thread
+    generation_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        max_new_tokens=150, # Keep responses short to prevent yapping
+        temperature=0.3,    # Low temp = more "sane"
+        repetition_penalty=1.2,
+        do_sample=True
     )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Yield the text as it comes in
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        yield partial_text
 demo = gr.ChatInterface(
     fn=generate_response,
+    title="Actually Fast AI",
+    description="SmolLM2 135M with Streaming. No more imaginary stories!"
 )
 if __name__ == "__main__":