rufatronics committed
Commit 2fec00c · verified
1 Parent(s): 732bdc1

Update app.py

Files changed (1)
  1. app.py +36 -22
app.py CHANGED
@@ -1,36 +1,50 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch

- # Load the model and tokenizer
- checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
- device = "cpu"  # Since we are on free tier CPU

- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
- model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

  def chat(message, history):
-     # Prepare the chat history for the model
-     # SmolLM2 uses a specific 'instruct' format
-     messages = [{"role": "system", "content": "You are a helpful assistant."}]
-     for val in history:
-         if val[0]: messages.append({"role": "user", "content": val[0]})
-         if val[1]: messages.append({"role": "assistant", "content": val[1]})

-     messages.append({"role": "user", "content": message})

-     # Convert to model-ready format
-     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = tokenizer(input_text, return_tensors="pt").to(device)

-     # Generate response
-     outputs = model.generate(**inputs, max_new_tokens=500, temperature=0.7, top_p=0.9, do_sample=True)
-     response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

-     return response

- # Create the Gradio interface
- demo = gr.ChatInterface(fn=chat, title="SmolLM2-135M Personal Assistant")

  if __name__ == "__main__":
      demo.launch()
 
  import gradio as gr
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer

+ # Model ID for the stable Instruct version
+ MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"

+ # Load tokenizer and model once at startup
+ print("System: Booting Stable-Lite Brain...")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map="cpu",
+     torch_dtype=torch.float32
+ )

  def chat(message, history):
+     # Instruct-style prompt for SmolLM2
+     # 'Be helpful and precise' is the only instruction, to save RAM/attention
+     prompt = f"<|user|>\nBe helpful and precise: {message}<|endoftext|>\n<|assistant|>\n"

+     inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=150,
+             temperature=0.1,
+             do_sample=True,
+             repetition_penalty=1.2,
+             eos_token_id=tokenizer.eos_token_id
+         )

+     # Extract only the new tokens (the response)
+     input_length = inputs.input_ids.shape[1]
+     response_tokens = outputs[0][input_length:]
+     response = tokenizer.decode(response_tokens, skip_special_tokens=True)

+     return response.strip()

+ # Gradio interface configured for stability
+ demo = gr.ChatInterface(
+     fn=chat,
+     title="Smol-AI Kano (Stable-Lite)",
+     description="Optimized for local students and businesses on 4GB RAM devices.",
+     cache_examples=False  # Prevents the Python 3.13 caching error
+ )

  if __name__ == "__main__":
      demo.launch()
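
Note that the new chat() hard-codes the special tokens in the prompt string, while the removed version used tokenizer.apply_chat_template, which pulls the correct tokens from the model's own chat template. For comparison, here is a minimal sketch of the same function using the template route, reusing the tokenizer/model globals loaded above and the commit's generation settings; like the committed version, it ignores the history argument:

def chat(message, history):
    # Let the tokenizer's chat template insert SmolLM2's special tokens
    messages = [{"role": "user", "content": f"Be helpful and precise: {message}"}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.1,
            do_sample=True,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id
        )
    # Decode only the tokens generated after the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response.strip()

Either version can be smoke-tested without launching Gradio, e.g. print(chat("Hello!", [])).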