import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

print("Lade DeepSeek-R1 GGUF...")

# Download the quantized GGUF model from the Hugging Face Hub.
# hf_hub_download caches locally, so repeated startups reuse the file.
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf",
)

# Initialize the model (n_ctx limited to 1024 for CPU stability).
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=2,
    verbose=False,
)


def respond(message, history):
    """Stream a model completion for *message* as a Gradio chat response.

    Args:
        message: The latest user message.
        history: Prior chat turns supplied by gr.ChatInterface (unused here;
            each request is answered without conversation context).

    Yields:
        The accumulated response text so far — Gradio re-renders the
        growing string on each yield, producing a streaming effect.
    """
    # Simple completion-style prompt. NOTE(review): the original comment
    # claimed the prompt "starts directly with the think tag", but no
    # <think> tag is actually present — confirm the intended prompt format
    # against the DeepSeek-R1 chat template before changing it.
    prompt = f"User: {message}\nAssistant: \n"

    response = ""

    # Stream tokens as they are generated; stop on a new user turn or EOS.
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["User:", "<|endoftext|>"],
        stream=True,
        temperature=0.7,
    )

    for chunk in stream:
        # Each chunk mirrors the OpenAI completion schema; guard against
        # chunks that carry no "text" field (e.g. the final stop chunk).
        if "text" in chunk["choices"][0]:
            token = chunk["choices"][0]["text"]
            response += token
            yield response


# Gradio chat UI wired to the streaming generator above.
demo = gr.ChatInterface(
    fn=respond,
    title="DeepSeek-R1 CPU (GGUF Safe Mode)",
    description="Läuft stabil auf Hugging Face CPU Hardware.",
)

if __name__ == "__main__":
    # Bind to all interfaces on the port expected by HF Spaces.
    demo.launch(server_name="0.0.0.0", server_port=7860)