rufatronics committed · Commit cd49dba · verified · 1 Parent(s): 953f149

Update app.py


import gradio as gr
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
import os

# 1. Setup Model & Token
model_id = "google/gemma-3n-E2B-it"
hf_token = os.getenv("HF_TOKEN")
device = "cpu"

print("Loading Gemma 3n (10GB)... This takes a few minutes.")

# We add low_cpu_mem_usage=True to prevent crashing on load
processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    device_map="auto"
)

def chat_function(message, history):
    # 2. Prepare the chat history for the model
    msgs = []
    for user_msg, assistant_msg in history:
        if user_msg:
            msgs.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
        if assistant_msg:
            msgs.append({"role": "model", "content": [{"type": "text", "text": assistant_msg}]})

    # Add the new message
    msgs.append({"role": "user", "content": [{"type": "text", "text": message}]})

    # 3. Apply the chat template
    # return_dict=True is needed so that **inputs and inputs['input_ids'] below work
    inputs = processor.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(device)

    # 4. Generate
    with torch.no_grad():  # saves memory during generation
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=True,
            temperature=0.4
        )

    # Decode only the newly generated tokens, not the prompt
    response = processor.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
    return response

# 5. The Interface
demo = gr.ChatInterface(
    fn=chat_function,
    title="Gemma 3n E2B (Fixed)",
    description="Now with 'timm' installed and optimized for 16GB RAM!",
)

if __name__ == "__main__":
    demo.launch()
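
A note on the history format assumed above: when gr.ChatInterface passes history in its tuples format (a list of [user_message, assistant_message] pairs), the unpacking loop in chat_function works as written. The snippet below is a hypothetical smoke test of chat_function outside the Gradio UI; the sample strings are invented, and it assumes the model above has already finished loading.

# Hypothetical smoke test (not part of the committed app.py):
# call chat_function directly with a tuples-style history.
sample_history = [
    ["What is the capital of France?", "The capital of France is Paris."],
]
print(chat_function("And what about Italy?", sample_history))

Note also that low_cpu_mem_usage=True and device_map="auto" require the accelerate package to be installed in the Space, alongside the timm dependency mentioned in the description; requirements.txt is not part of this commit, so that is an assumption.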

Files changed (1)
  1. app.py +0 -65
app.py CHANGED
@@ -1,65 +0,0 @@
- import gradio as gr
- from transformers import AutoModelForImageTextToText, AutoProcessor
- import torch
- import os
-
- # 1. Setup Model & Token
- model_id = "google/gemma-3n-E2B-it"
- hf_token = os.getenv("HF_TOKEN")
-
- # Use CPU for Free Tier
- device = "cpu"
-
- print("Loading Gemma 3n... This takes about 2 minutes on CPU.")
-
- # Gemma 3n uses AutoProcessor and AutoModelForImageTextToText for its multimodal brain
- processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
- model = AutoModelForImageTextToText.from_pretrained(
-     model_id,
-     token=hf_token,
-     torch_dtype=torch.float32,
-     device_map="auto"
- )
-
- def chat_function(message, history):
-     # 2. Correct History Formatting
-     # This prevents the 'list error' you saw before
-     msgs = []
-     for user_msg, assistant_msg in history:
-         if user_msg: msgs.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
-         if assistant_msg: msgs.append({"role": "model", "content": [{"type": "text", "text": assistant_msg}]})
-
-     # Add the new message
-     msgs.append({"role": "user", "content": [{"type": "text", "text": message}]})
-
-     # 3. Prepare Inputs
-     inputs = processor.apply_chat_template(
-         msgs,
-         add_generation_prompt=True,
-         tokenize=True,
-         return_tensors="pt"
-     ).to(device)
-
-     # 4. Generate Answer
-     # We use a lower temperature (0.4) for better school/fact accuracy
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=512,
-         do_sample=True,
-         temperature=0.4
-     )
-
-     # Extract only the response
-     response = processor.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
-     return response
-
- # 5. The Interface
- demo = gr.ChatInterface(
-     fn=chat_function,
-     title="Gemma 3n E2B: The Smart Assistant",
-     description="I have much more 'sense' than the 135M model. Try me with logic riddles or school assignments!",
-     theme="ocean"
- )
-
- if __name__ == "__main__":
-     demo.launch()