| import gradio as gr |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
| import os |
|
|
| |
# Disable Gradio's telemetry before any Gradio object is created.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"


# GGUF checkpoint to pull from the Hugging Face Hub.
# NOTE(review): "gemma-4-E2B" — confirm this repo exists; the known E2B
# checkpoint is Gemma 3n E2B (unsloth/gemma-3n-E2B-it-GGUF). Verify before deploy.
repo_id = "unsloth/gemma-4-E2B-it-GGUF"
filename = "gemma-4-E2B-it-Q4_K_M.gguf"


print("正在加载模型...")
# Download (or reuse the local cache of) the quantized model file.
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
# CPU inference: 2 threads, 2048-token context, Gemma chat template.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, chat_format="gemma")
|
|
def chat_with_gemma(prompt, history):
    """Stream a chat reply from the model.

    Args:
        prompt: The new user message.
        history: Iterable of ``(user_msg, bot_msg)`` pairs from earlier turns.

    Yields:
        The accumulated assistant response text after each streamed chunk.
    """
    # Rebuild the whole conversation as alternating user/assistant messages,
    # then append the new prompt as the final user turn.
    messages = [
        {"role": role, "content": text}
        for user_turn, bot_turn in history
        for role, text in (("user", user_turn), ("assistant", bot_turn))
    ]
    messages.append({"role": "user", "content": prompt})

    response = ""
    for chunk in llm.create_chat_completion(messages=messages, max_tokens=512, stream=True):
        delta = chunk['choices'][0]['delta']
        # Some chunks (e.g. role headers / finish markers) carry no content.
        if 'content' in delta:
            response += delta['content']
            yield response
|
|
| |
with gr.Blocks() as demo:
    gr.Markdown("# Gemma 4 E2B (Docker CPU版)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def respond(message, chat_history):
        """Stream the model's answer into the chatbot, clearing the textbox.

        Delegates to chat_with_gemma so the FULL conversation history is sent
        to the model — the previous inline version built ``messages`` from
        only the latest message, so the model lost all context on every turn —
        and so the same 512-token cap applies consistently.

        Yields:
            ("", updated_history) on each streamed chunk; Gradio keeps the
            last yielded value, so the textbox clears and the chatbot shows
            the growing reply.
        """
        for partial in chat_with_gemma(message, chat_history):
            # Yield a fresh list (don't mutate chat_history in place) so
            # Gradio always receives a new value to render.
            yield "", chat_history + [(message, partial)]

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
|
|
# Launch the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":


    # debug=True blocks the main thread and prints server errors to stdout.
    demo.launch(debug=True)