rufatronics committed · Commit cd49dba · verified · 1 Parent(s): 953f149

Update app.py


import gradio as gr
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
import os

# 1. Setup Model & Token
model_id = "google/gemma-3n-E2B-it"
hf_token = os.getenv("HF_TOKEN")
device = "cpu"

print("Loading Gemma 3n (10GB)... This takes a few minutes.")

# We add low_cpu_mem_usage=True to prevent crashing on load
processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    device_map="auto"
)

def chat_function(message, history):
    # 2. Prepare the chat history for the model
    msgs = []
    for user_msg, assistant_msg in history:
        if user_msg:
            msgs.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
        if assistant_msg:
            msgs.append({"role": "model", "content": [{"type": "text", "text": assistant_msg}]})

    # Add the new message
    msgs.append({"role": "user", "content": [{"type": "text", "text": message}]})

    # 3. Apply the chat template
    # return_dict=True is needed so that **inputs and inputs['input_ids'] below work
    inputs = processor.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(device)

    # 4. Generate
    with torch.no_grad():  # saves memory during generation
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=True,
            temperature=0.4
        )

    # Decode only the newly generated tokens, not the prompt
    response = processor.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
    return response

# 5. The Interface
demo = gr.ChatInterface(
    fn=chat_function,
    title="Gemma 3n E2B (Fixed)",
    description="Now with 'timm' installed and optimized for 16GB RAM!",
)

if __name__ == "__main__":
    demo.launch()
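
A note on the history format assumed above: when gr.ChatInterface passes history in its tuples format (a list of [user_message, assistant_message] pairs), the unpacking loop in chat_function works as written. The snippet below is a hypothetical smoke test of chat_function outside the Gradio UI; the sample strings are invented, and it assumes the model above has already finished loading.

# Hypothetical smoke test (not part of the committed app.py):
# call chat_function directly with a tuples-style history.
sample_history = [
    ["What is the capital of France?", "The capital of France is Paris."],
]
print(chat_function("And what about Italy?", sample_history))

Note also that low_cpu_mem_usage=True and device_map="auto" require the accelerate package to be installed in the Space, alongside the timm dependency mentioned in the description; requirements.txt is not part of this commit, so that is an assumption.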

Files changed (1)
  1. app.py +0 -65
app.py CHANGED
@@ -1,65 +0,0 @@
- import gradio as gr
- from transformers import AutoModelForImageTextToText, AutoProcessor
- import torch
- import os
-
- # 1. Setup Model & Token
- model_id = "google/gemma-3n-E2B-it"
- hf_token = os.getenv("HF_TOKEN")
-
- # Use CPU for Free Tier
- device = "cpu"
-
- print("Loading Gemma 3n... This takes about 2 minutes on CPU.")
-
- # Gemma 3n uses AutoProcessor and AutoModelForImageTextToText for its multimodal brain
- processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
- model = AutoModelForImageTextToText.from_pretrained(
-     model_id,
-     token=hf_token,
-     torch_dtype=torch.float32,
-     device_map="auto"
- )
-
- def chat_function(message, history):
-     # 2. Correct History Formatting
-     # This prevents the 'list error' you saw before
-     msgs = []
-     for user_msg, assistant_msg in history:
-         if user_msg: msgs.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
-         if assistant_msg: msgs.append({"role": "model", "content": [{"type": "text", "text": assistant_msg}]})
-
-     # Add the new message
-     msgs.append({"role": "user", "content": [{"type": "text", "text": message}]})
-
-     # 3. Prepare Inputs
-     inputs = processor.apply_chat_template(
-         msgs,
-         add_generation_prompt=True,
-         tokenize=True,
-         return_tensors="pt"
-     ).to(device)
-
-     # 4. Generate Answer
-     # We use a lower temperature (0.4) for better school/fact accuracy
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=512,
-         do_sample=True,
-         temperature=0.4
-     )
-
-     # Extract only the response
-     response = processor.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
-     return response
-
- # 5. The Interface
- demo = gr.ChatInterface(
-     fn=chat_function,
-     title="Gemma 3n E2B: The Smart Assistant",
-     description="I have much more 'sense' than the 135M model. Try me with logic riddles or school assignments!",
-     theme="ocean"
- )
-
- if __name__ == "__main__":
-     demo.launch()