import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model ID for the stable Instruct version
MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Load tokenizer and model once at startup
print("System: Booting Stable-Lite Brain...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, 
    device_map="cpu", 
    torch_dtype=torch.float32
)
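
# Rough footprint estimate (assumption based on the model name): 135M
# parameters at float32 take about 135M * 4 bytes ≈ 540 MB of RAM for the
# weights alone, which is why this fits comfortably in a 4GB budget.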

def chat(message, history):
    # SmolLM2-Instruct uses the ChatML format; build the prompt with the
    # tokenizer's own chat template instead of hard-coding special tokens.
    # Only the current message is sent (history is ignored) and
    # 'Be helpful and precise' is the only instruction, to save RAM/attention.
    messages = [{"role": "user", "content": f"Be helpful and precise: {message}"}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=150, 
            temperature=0.1, 
            do_sample=True,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Extracting only the new tokens (the response)
    input_length = inputs.input_ids.shape[1]
    response_tokens = outputs[0][input_length:]
    response = tokenizer.decode(response_tokens, skip_special_tokens=True)
    
    return response.strip()

# Gradio Interface configured for Stability
demo = gr.ChatInterface(
    fn=chat, 
    title="Smol-AI Kano (Stable-Lite)",
    description="Optimized for local students and businesses on 4GB RAM devices.",
    cache_examples=False # Prevents the Python 3.13 caching error
)

if __name__ == "__main__":
    demo.launch()
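
# Usage (assuming gradio, torch and transformers are installed):
#   pip install gradio torch transformers
#   python app.py
# Gradio serves the chat UI at http://127.0.0.1:7860 by default.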