| import gradio as gr |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
| import os |
|
|
# --- Model setup (runs once at import time) ---
print("Lade DeepSeek-R1 GGUF...")

# Fetch the quantized GGUF weights from the Hugging Face Hub.
# hf_hub_download caches locally, so repeat startups skip the download.
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf",
)

# CPU-friendly settings: small context window and two worker threads
# keep memory/CPU use within free-tier Space limits.
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=2,
    verbose=False,
)
|
|
def respond(message, history):
    """Stream the model's reply to *message*, yielding the growing text.

    Fix: the original ignored *history*, so the chatbot forgot every
    previous turn. The prompt now replays the conversation so far in the
    same "User:/Assistant:" format before appending the new message.

    Args:
        message: The user's latest message (str).
        history: Prior turns from gr.ChatInterface — either a list of
            (user, assistant) pairs or a list of {"role", "content"}
            dicts, depending on the Gradio version. TODO confirm which
            format this Gradio install passes.

    Yields:
        The accumulated assistant response after each streamed token.
    """
    parts = []
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one dict per utterance.
            role = "User" if turn.get("role") == "user" else "Assistant"
            parts.append(f"{role}: {turn.get('content', '')}\n")
        else:
            # Legacy tuple format: (user_message, assistant_message).
            user_msg, bot_msg = turn
            if user_msg:
                parts.append(f"User: {user_msg}\n")
            if bot_msg:
                parts.append(f"Assistant: {bot_msg}\n")
    # Open the assistant turn with <think> to trigger R1-style reasoning.
    parts.append(f"User: {message}\nAssistant: <think>\n")
    prompt = "".join(parts)

    response = ""
    stream = llm(
        prompt,
        max_tokens=512,
        # Stop before the model hallucinates the next user turn.
        stop=["User:", "<|endoftext|>"],
        stream=True,
        temperature=0.7,
    )
    for chunk in stream:
        choice = chunk["choices"][0]
        # Some stream chunks (e.g. the final one) carry no "text" field.
        if "text" in choice:
            response += choice["text"]
            yield response
|
|
| |
| demo = gr.ChatInterface( |
| fn=respond, |
| title="DeepSeek-R1 CPU (GGUF Safe Mode)", |
| description="Läuft stabil auf Hugging Face CPU Hardware." |
| ) |
|
|
if __name__ == "__main__":
    # Bind to all interfaces on the port Hugging Face Spaces expects.
    demo.launch(server_name="0.0.0.0", server_port=7860)