import gradio as gr
from transformers import pipeline

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_NEW_TOKENS = 512
_CHAT_ROLES = ("system", "user", "assistant")

# Load the model on the CPU once at import time so every request reuses it.
pipe = pipeline("text-generation", model=MODEL_ID, device_map="cpu")


def _as_text(content):
    """Return the plain text carried by a history entry, or None if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): assumes structured content is a list of parts shaped like
        # {"type": "text", "text": "..."} -- confirm against the installed
        # Gradio version.
        parts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "\n".join(parts) if parts else None
    # Anything else (None, file tuples, components) has no text for the model.
    return None


def _history_to_messages(history):
    """Convert the chat history into a list of role/content message dicts.

    Accepts both history shapes: ``[user, assistant]`` pairs and
    ``{"role": ..., "content": ...}`` dicts. Entries without usable text are
    skipped rather than raising.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _as_text(turn.get("content"))
            if role in _CHAT_ROLES and text is not None:
                # Copy only role/content; extra keys are not part of the prompt.
                messages.append({"role": role, "content": text})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            for role, content in zip(("user", "assistant"), turn):
                text = _as_text(content)
                if text is not None:
                    messages.append({"role": role, "content": text})
    return messages


def predict(message, history):
    """Generate the assistant's reply to ``message`` given the prior turns.

    Args:
        message: The user's latest message.
        history: Previous turns of the conversation as supplied by the chat
            UI; may be empty or None.

    Returns:
        The text content of the last message in the pipeline's generated
        conversation.
    """
    # Include earlier turns so the model answers in context instead of
    # treating every message as the start of a new conversation.
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # Generate the response.
    results = pipe(messages, max_new_tokens=MAX_NEW_TOKENS)

    # The pipeline returns the whole conversation; the last message is the
    # newly generated reply, so return only its text.
    return results[0]["generated_text"][-1]["content"]


# Wrap predict in a chat UI. No endpoint name is passed explicitly here, so
# whatever name Gradio assigns by default applies.
demo = gr.ChatInterface(fn=predict).queue()

if __name__ == "__main__":
    demo.launch()