import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os
# Disable Gradio analytics so no outbound telemetry calls are made from the Spaces container
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
repo_id = "unsloth/gemma-3n-E2B-it-GGUF"
filename = "gemma-3n-E2B-it-Q4_K_M.gguf"
print("正在加载模型...")
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
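# n_ctx=2048 keeps memory use modest and n_threads=2 suits the 2-vCPU free
# Spaces tier; chat_format="gemma" selects llama-cpp-python's built-in Gemma
# chat template.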
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, chat_format="gemma")
def chat_with_gemma(prompt, history):
    # Multi-turn helper (not wired into the UI below): rebuild the llama.cpp
    # message list from Gradio's (user, bot) tuple history.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": prompt})
    stream = llm.create_chat_completion(messages=messages, max_tokens=512, stream=True)
    response = ""
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            response += delta['content']
            yield response
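# Sketch of an alternative (not wired in here): chat_with_gemma already has
# the (message, history) signature that gr.ChatInterface expects, so once the
# template issue noted below is resolved, the whole UI could collapse to:
#
#     demo = gr.ChatInterface(chat_with_gemma)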
# Use the simplest Blocks layout to sidestep the Jinja2 errors raised by the
# more complex ChatInterface template.
with gr.Blocks() as demo:
    gr.Markdown("# Gemma 3n E2B (Docker CPU build)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    def respond(message, chat_history):
        # Kept single-turn for simplicity: only the latest message goes to the
        # model (chat_with_gemma above shows the multi-turn variant).
        bot_message = ""
        messages = [{"role": "user", "content": message}]
        stream = llm.create_chat_completion(messages=messages, stream=True)
        # Append the pair once, then update the last entry in place as tokens
        # stream in, yielding "" to clear the textbox on each update.
        chat_history.append((message, bot_message))
        for chunk in stream:
            delta = chunk['choices'][0]['delta']
            if 'content' in delta:
                bot_message += delta['content']
                chat_history[-1] = (message, bot_message)
                yield "", chat_history
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
if __name__ == "__main__":
    # No extra launch arguments, sidestepping parameter validation entirely;
    # on a Hugging Face Space this one line picks up the rest of the
    # configuration automatically.
    demo.launch(debug=True)
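    # Fallback sketch for a Docker-based Space: if the default launch does not
    # bind to a reachable interface, Gradio's standard server arguments make
    # the binding explicit (7860 is the port Spaces expects):
    #
    #     demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)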