"""Serve a CPU-only text-generation demo through Gradio mounted on FastAPI.

Importing this module loads the tokenizer and model named by ``MODEL``,
builds a ``text-generation`` pipeline, wraps it in a Gradio interface and
mounts that interface at the root path of the FastAPI application ``app``.
"""

from fastapi import FastAPI
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

MODEL = "google/gemma-4-E4B"

# Optimistic loading for CPU: full-precision weights, reduced peak RAM while
# loading, everything mapped onto the CPU device.
# NOTE(review): trust_remote_code=True executes Python code shipped with the
# model repository; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map={"": "cpu"},
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# No explicit `device` is passed here: the model has already been placed on
# the CPU through `device_map`, and some transformers releases reject a
# `device` argument for a model that was dispatched that way.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


def generate(prompt: str) -> str:
    """Return the pipeline's generated text for *prompt*.

    Decoding is greedy (``do_sample=False``) and limited to 128 new tokens.
    The value returned is the ``"generated_text"`` field of the first
    pipeline result.
    """
    out = generator(prompt, max_new_tokens=128, do_sample=False)
    return out[0]["generated_text"]


demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=4, label="Prompt"),
    outputs="text",
    title="Gemma-4-E4B (CPU)",
)

# Expose the Gradio UI at "/" of the ASGI application.
app = FastAPI()
app = gr.mount_gradio_app(app, demo, path="/")