rufatronics committed
Commit 2fec00c · verified
1 Parent(s): 732bdc1

Update app.py

Files changed (1)
  1. app.py +36 -22
app.py CHANGED
@@ -1,36 +1,50 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch

- # Load the model and tokenizer
- checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
- device = "cpu"  # Since we are on free tier CPU

- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
- model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

  def chat(message, history):
-     # Prepare the chat history for the model
-     # SmolLM2 uses a specific 'instruct' format
-     messages = [{"role": "system", "content": "You are a helpful assistant."}]
-     for val in history:
-         if val[0]: messages.append({"role": "user", "content": val[0]})
-         if val[1]: messages.append({"role": "assistant", "content": val[1]})

-     messages.append({"role": "user", "content": message})

-     # Convert to model-ready format
-     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = tokenizer(input_text, return_tensors="pt").to(device)

-     # Generate response
-     outputs = model.generate(**inputs, max_new_tokens=500, temperature=0.7, top_p=0.9, do_sample=True)
-     response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

-     return response

- # Create the Gradio interface
- demo = gr.ChatInterface(fn=chat, title="SmolLM2-135M Personal Assistant")

  if __name__ == "__main__":
      demo.launch()
 
  import gradio as gr
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer

+ # Model ID for the stable Instruct version
+ MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"

+ # Load tokenizer and model once at startup
+ print("System: Booting Stable-Lite Brain...")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map="cpu",
+     torch_dtype=torch.float32
+ )

  def chat(message, history):
+     # Instruct-style prompt for SmolLM2
+     # 'Be helpful and precise' is the only instruction, to save RAM/attention
+     prompt = f"<|user|>\nBe helpful and precise: {message}<|endoftext|>\n<|assistant|>\n"

+     inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=150,
+             temperature=0.1,
+             do_sample=True,
+             repetition_penalty=1.2,
+             eos_token_id=tokenizer.eos_token_id
+         )

+     # Extract only the new tokens (the response)
+     input_length = inputs.input_ids.shape[1]
+     response_tokens = outputs[0][input_length:]
+     response = tokenizer.decode(response_tokens, skip_special_tokens=True)

+     return response.strip()

+ # Gradio interface configured for stability
+ demo = gr.ChatInterface(
+     fn=chat,
+     title="Smol-AI Kano (Stable-Lite)",
+     description="Optimized for local students and businesses on 4GB RAM devices.",
+     cache_examples=False  # Prevents the Python 3.13 caching error
+ )

  if __name__ == "__main__":
      demo.launch()
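
Note that the new chat() hard-codes the special tokens in the prompt string, while the removed version used tokenizer.apply_chat_template, which pulls the correct tokens from the model's own chat template. For comparison, here is a minimal sketch of the same function using the template route, reusing the tokenizer/model globals loaded above and the commit's generation settings; like the committed version, it ignores the history argument:

def chat(message, history):
    # Let the tokenizer's chat template insert SmolLM2's special tokens
    messages = [{"role": "user", "content": f"Be helpful and precise: {message}"}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.1,
            do_sample=True,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id
        )
    # Decode only the tokens generated after the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response.strip()

Either version can be smoke-tested without launching Gradio, e.g. print(chat("Hello!", [])).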