ntdservices committed on
Commit
c4bf23b
·
verified ·
1 Parent(s): 68a9921

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile.txt +16 -0
  2. app.py +106 -0
  3. requirements.txt +5 -0
Dockerfile.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE(review): this file must be saved as `Dockerfile` (no .txt extension)
# for `docker build` / HF Spaces to pick it up automatically.
FROM python:3.11-slim

# minimal build tools for llama-cpp's C++ extension
RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies first so Docker layer caching survives app-code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

# `PORT` is provided by HF; default to 7860 locally.
# FIX: the comment above promised a local default, but no ENV PORT was set —
# app.py reads os.getenv("PORT", 7860), so declare it here for clarity too.
ENV PORT=7860
ENV NUM_CPU=4
EXPOSE 7860
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Space – tiny streaming chatbot + JSON API
2
+ # • Gradio UI with incremental token streaming
3
+ # • POST /api/generate → {"response": "..."}
4
+ # • Easily swap the model path / prompt template later
5
+ #
6
+ # Tested on HF free CPU – 16 vCPU, 16 GB RAM
7
+ # ---------------------------------------------------------
8
+
9
+ import os, asyncio
10
+ from fastapi import FastAPI, HTTPException
11
+ from pydantic import BaseModel
12
+ import gradio as gr
13
+ from llama_cpp import Llama
14
+ from huggingface_hub import hf_hub_download
15
+
16
# Model configuration.
# NOTE(review): verify the repo id / filename — TheBloke's TinyLlama chat
# GGUF repos are versioned (e.g. "...-1.1B-Chat-v1.0-GGUF" with file
# "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"); hf_hub_download will raise if
# either is wrong. TODO confirm against the Hub.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE = "tinyllama-1.1b-chat.q4_K_M.gguf"   # 2 GB, 4-bit
N_CTX = 4096          # tokens of context window passed to llama.cpp
MAX_TOKENS = 512      # per-request generation limit

# ---------- model load (one-time) ----------
# Runs at import time: downloads the weights (network I/O) and loads them
# into RAM before the server starts accepting requests.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(model_path=model_path,
            n_ctx=N_CTX,
            # NOTE(review): default here is 8 threads, but the Dockerfile
            # sets ENV NUM_CPU=4 — the Dockerfile value wins in the container.
            n_threads=int(os.getenv("NUM_CPU", "8")),  # feel free to tune
            n_gpu_layers=0,        # CPU-only deployment
            logits_all=False,      # we only need the sampled token's logits
            use_mlock=True)        # pin weights in RAM to avoid swapping

# System prompt prepended to every conversation (UI and JSON API).
SYSTEM_PROMPT = "You are a helpful, concise news assistant."
31
+
32
+ # ---------- streaming generation ----------
33
def stream_chat(prompt, history):
    """Stream the assistant's reply to *prompt*, yielding the growing text.

    Parameters
    ----------
    prompt : str
        The new user message.
    history : list[list[str]]
        Prior turns as [user, assistant] pairs (Gradio chatbot format).

    Yields
    ------
    str
        The partial completion after each streamed token.
    """
    # llama.cpp wants the full conversation flattened into a single string.
    turns = [f"<|system|>{SYSTEM_PROMPT}"]
    for user_msg, bot_msg in history:
        turns.append(f"<|user|>{user_msg}")
        turns.append(f"<|assistant|>{bot_msg}")
    turns.append(f"<|user|>{prompt}")
    # FIX: end the prompt with the assistant tag so the model is cued to
    # answer rather than continue the user's turn. The tag is also in the
    # stop list, so it can never leak into generated output.
    turns.append("<|assistant|>")
    final_prompt = "\n".join(turns)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        # Stop before the model invents the next turn of the dialogue.
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial
55
+
56
+ # ---------- Gradio interface ----------
57
# Build the chat UI: a transcript window, an input box, and a send button.
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot – streams as it thinks")
    chat_window = gr.Chatbot()
    with gr.Row():
        user_box = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
        send_button = gr.Button("Send", variant="primary")

    def _queue_message(message, log):
        # Append the new user turn with an empty assistant slot and clear
        # the input box; the reply is filled in by _stream_reply below.
        return "", log + [[message, ""]]

    def _stream_reply(log):
        # Incrementally fill the assistant slot of the newest turn,
        # re-yielding the whole transcript so Gradio repaints it.
        latest = log[-1][0]
        for text in stream_chat(latest, log[:-1]):
            log[-1][1] = text
            yield log

    # Wire both triggers (Enter in the textbox, clicking Send) identically.
    for trigger in (user_box.submit, send_button.click):
        trigger(_queue_message, [user_box, chat_window], [user_box, chat_window]).then(
            _stream_reply, chat_window, chat_window
        )
81
+
82
+ # ---------- JSON API ----------
83
app = FastAPI()
app = gr.mount_gradio_app(app, demo, path="/")  # Gradio UI served at root "/"


class GenRequest(BaseModel):
    """Request body for POST /api/generate."""
    prompt: str
    max_tokens: int | None = None  # falls back to MAX_TOKENS when omitted


@app.post("/api/generate")
async def api_generate(req: GenRequest):
    """One-shot (non-streaming) completion, returned as {"response": "..."}.

    Raises HTTP 400 when the prompt is empty.
    """
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")

    def _infer():
        return llm.create_completion(
            # FIX: end with the assistant tag so the model answers instead
            # of continuing the user turn (tag is also a stop token).
            f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
            max_tokens=req.max_tokens or MAX_TOKENS,
            temperature=0.7,
            top_p=0.9,
            stream=False,
            stop=["<|user|>", "<|assistant|>", "</s>"],
        )

    # FIX: llama.cpp inference is CPU-bound and blocking; calling it directly
    # inside this async handler would stall the event loop (freezing the
    # Gradio UI for every other client). Offload it to a worker thread —
    # `asyncio` was already imported at the top of the file but never used.
    gen = await asyncio.to_thread(_infer)
    return {"response": gen["choices"][0]["text"].strip()}
103
+
104
if __name__ == "__main__":  # HF launches `python app.py`
    import uvicorn

    # Bind on every interface; HF injects PORT, otherwise fall back to 7860.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.29.0
2
+ fastapi
3
+ uvicorn[standard]
4
+ llama-cpp-python>=0.2.38
5
+ huggingface-hub