Spaces:
Configuration error
Configuration error
| import os | |
| import gradio as gr | |
| from fastapi import FastAPI | |
| from transformers import AutoTokenizer, pipeline | |
| import threading | |
# Hugging Face model id; overridable through the MODEL_NAME environment variable.
# For a free Space, set MODEL_NAME=google/gemma-4-E2B in the settings if E4B fails.
MODEL = os.getenv("MODEL_NAME", "google/gemma-4-E4B")

# Lazily populated by load_model(); both stay None until a load succeeds.
tokenizer = None
generator = None

# Lock held by load_model() for the whole duration of a load.
_model_lock = threading.Lock()
# Set to True by load_model() while a load is in progress.
_loading = False
def load_model():
    """Load the tokenizer and the text-generation pipeline on CPU, once.

    Populates the module-level ``tokenizer`` and ``generator`` globals. The
    whole load runs while holding ``_model_lock``; a call that finds both
    globals already set returns immediately. ``_loading`` is True while a
    load is in progress and is reset to False afterwards, even when loading
    raises.

    Raises:
        Any exception raised by ``from_pretrained`` or ``pipeline``
        propagates to the caller. In that case neither global is modified,
        so a later call retries the full load.
    """
    global tokenizer, generator, _loading
    with _model_lock:
        if tokenizer is not None and generator is not None:
            return
        _loading = True
        try:
            from transformers import AutoModelForCausalLM

            loaded_tokenizer = AutoTokenizer.from_pretrained(
                MODEL, trust_remote_code=True
            )
            # CPU-only load.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL,
                device_map={"": "cpu"},
                torch_dtype="float32",
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            )
            # No `device` argument here: the model was already placed on CPU
            # through `device_map`, and some transformers releases reject an
            # explicit `device` for a model that carries a device map.
            loaded_generator = pipeline(
                "text-generation", model=model, tokenizer=loaded_tokenizer
            )
            # Publish both globals only after everything succeeded, so a
            # failed load never leaves a tokenizer without a generator.
            tokenizer, generator = loaded_tokenizer, loaded_generator
        finally:
            _loading = False
def generate(prompt):
    """Return the text produced by the generation pipeline for ``prompt``.

    Calls ``load_model()`` first when the module-level ``generator`` has not
    been created yet. Generation is greedy (``do_sample=False``).
    """
    if generator is None:
        load_model()
    # Cap the number of new tokens to reduce memory usage.
    generation_options = {"max_new_tokens": 64, "do_sample": False}
    results = generator(prompt, **generation_options)
    first_result = results[0]
    return first_result["generated_text"]
# Gradio UI: a four-line prompt box wired to generate(), with plain-text output.
demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=4, label="Prompt"),
    outputs="text",
    title="Gemma (Space CPU)",
)

# Serve the Gradio UI from the root path of a FastAPI application.
app = gr.mount_gradio_app(FastAPI(), demo, path="/")