"""Gradio text-generation demo mounted on a FastAPI app, running on CPU.

The model is not loaded at import time; it is loaded on the first call to
``generate`` (or an explicit call to ``load_model``).
"""

import os
import threading

import gradio as gr
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint to load; override with the MODEL_NAME environment variable.
# For a free Space, set MODEL_NAME=google/gemma-4-E2B in the settings if E4B
# fails.
MODEL = os.environ.get("MODEL_NAME", "google/gemma-4-E4B")

# Populated by load_model(); both stay None until a load succeeds.
tokenizer = None
generator = None
_model_lock = threading.Lock()
# True only while a load is in progress inside load_model().
_loading = False


def load_model():
    """Load the tokenizer and the text-generation pipeline once, on CPU.

    Callers are serialized by ``_model_lock``; a caller that finds both
    ``tokenizer`` and ``generator`` already set returns immediately.

    Raises:
        Whatever ``from_pretrained`` / ``pipeline`` raise. In that case
        neither global is modified, so a later call retries the full load.
    """
    global tokenizer, generator, _loading
    with _model_lock:
        if tokenizer is not None and generator is not None:
            return
        _loading = True
        try:
            # NOTE(review): trust_remote_code=True lets the model repository
            # run its own Python code, and MODEL is taken from an environment
            # variable -- confirm this is required for the chosen checkpoint.
            loaded_tokenizer = AutoTokenizer.from_pretrained(
                MODEL, trust_remote_code=True
            )
            # CPU-only load.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL,
                device_map={"": "cpu"},
                torch_dtype="float32",
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            )
            # No explicit `device` here: the model was already placed through
            # `device_map`, and transformers pipelines warn (recent releases
            # raise ValueError) when `device` is also given for such a model.
            loaded_generator = pipeline(
                "text-generation", model=model, tokenizer=loaded_tokenizer
            )
            # Publish only after everything succeeded so a failed load never
            # leaves a tokenizer without a matching generator.
            tokenizer, generator = loaded_tokenizer, loaded_generator
        finally:
            _loading = False


def generate(prompt):
    """Return the pipeline's ``generated_text`` for *prompt*.

    Loads the model on first use. An empty, whitespace-only or ``None``
    prompt returns ``""`` without loading the model or running inference.
    """
    if not prompt or not prompt.strip():
        return ""
    if generator is None:
        load_model()
    # Cap the number of new tokens to reduce memory usage.
    out = generator(prompt, max_new_tokens=64, do_sample=False)
    return out[0]["generated_text"]


demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=4, label="Prompt"),
    outputs="text",
    title="Gemma (Space CPU)",
)

app = FastAPI()
app = gr.mount_gradio_app(app, demo, path="/")