ntdservices committed on
Commit
c4bf23b
·
verified ·
1 Parent(s): 68a9921

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile.txt +16 -0
  2. app.py +106 -0
  3. requirements.txt +5 -0
Dockerfile.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE(review): this file must be saved as `Dockerfile` (no .txt extension)
# for `docker build` / HF Spaces to pick it up automatically.
FROM python:3.11-slim

# minimal build tools for llama-cpp's C++ extension
RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies first so Docker layer caching survives app-code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

# `PORT` is provided by HF; default to 7860 locally.
# FIX: the comment above promised a local default, but no ENV PORT was set —
# app.py reads os.getenv("PORT", 7860), so declare it here for clarity too.
ENV PORT=7860
ENV NUM_CPU=4
EXPOSE 7860
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Space – tiny streaming chatbot + JSON API
2
+ # • Gradio UI with incremental token streaming
3
+ # • POST /api/generate → {"response": "..."}
4
+ # • Easily swap the model path / prompt template later
5
+ #
6
+ # Tested on HF free CPU – 16 vCPU, 16 GB RAM
7
+ # ---------------------------------------------------------
8
+
9
+ import os, asyncio
10
+ from fastapi import FastAPI, HTTPException
11
+ from pydantic import BaseModel
12
+ import gradio as gr
13
+ from llama_cpp import Llama
14
+ from huggingface_hub import hf_hub_download
15
+
16
# Model configuration.
# NOTE(review): verify the repo id / filename — TheBloke's TinyLlama chat
# GGUF repos are versioned (e.g. "...-1.1B-Chat-v1.0-GGUF" with file
# "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"); hf_hub_download will raise if
# either is wrong. TODO confirm against the Hub.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE = "tinyllama-1.1b-chat.q4_K_M.gguf"   # 2 GB, 4-bit
N_CTX = 4096          # tokens of context window passed to llama.cpp
MAX_TOKENS = 512      # per-request generation limit

# ---------- model load (one-time) ----------
# Runs at import time: downloads the weights (network I/O) and loads them
# into RAM before the server starts accepting requests.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(model_path=model_path,
            n_ctx=N_CTX,
            # NOTE(review): default here is 8 threads, but the Dockerfile
            # sets ENV NUM_CPU=4 — the Dockerfile value wins in the container.
            n_threads=int(os.getenv("NUM_CPU", "8")),  # feel free to tune
            n_gpu_layers=0,        # CPU-only deployment
            logits_all=False,      # we only need the sampled token's logits
            use_mlock=True)        # pin weights in RAM to avoid swapping

# System prompt prepended to every conversation (UI and JSON API).
SYSTEM_PROMPT = "You are a helpful, concise news assistant."
31
+
32
+ # ---------- streaming generation ----------
33
def stream_chat(prompt, history):
    """Stream the assistant's reply to *prompt*, yielding the growing text.

    Parameters
    ----------
    prompt : str
        The new user message.
    history : list[list[str]]
        Prior turns as [user, assistant] pairs (Gradio chatbot format).

    Yields
    ------
    str
        The partial completion after each streamed token.
    """
    # llama.cpp wants the full conversation flattened into a single string.
    turns = [f"<|system|>{SYSTEM_PROMPT}"]
    for user_msg, bot_msg in history:
        turns.append(f"<|user|>{user_msg}")
        turns.append(f"<|assistant|>{bot_msg}")
    turns.append(f"<|user|>{prompt}")
    # FIX: end the prompt with the assistant tag so the model is cued to
    # answer rather than continue the user's turn. The tag is also in the
    # stop list, so it can never leak into generated output.
    turns.append("<|assistant|>")
    final_prompt = "\n".join(turns)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        # Stop before the model invents the next turn of the dialogue.
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial
55
+
56
+ # ---------- Gradio interface ----------
57
# Build the chat UI: a transcript window, an input box, and a send button.
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot – streams as it thinks")
    chat_window = gr.Chatbot()
    with gr.Row():
        user_box = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
        send_button = gr.Button("Send", variant="primary")

    def _queue_message(message, log):
        # Append the new user turn with an empty assistant slot and clear
        # the input box; the reply is filled in by _stream_reply below.
        return "", log + [[message, ""]]

    def _stream_reply(log):
        # Incrementally fill the assistant slot of the newest turn,
        # re-yielding the whole transcript so Gradio repaints it.
        latest = log[-1][0]
        for text in stream_chat(latest, log[:-1]):
            log[-1][1] = text
            yield log

    # Wire both triggers (Enter in the textbox, clicking Send) identically.
    for trigger in (user_box.submit, send_button.click):
        trigger(_queue_message, [user_box, chat_window], [user_box, chat_window]).then(
            _stream_reply, chat_window, chat_window
        )
81
+
82
+ # ---------- JSON API ----------
83
app = FastAPI()
app = gr.mount_gradio_app(app, demo, path="/")  # Gradio UI served at root "/"


class GenRequest(BaseModel):
    """Request body for POST /api/generate."""
    prompt: str
    max_tokens: int | None = None  # falls back to MAX_TOKENS when omitted


@app.post("/api/generate")
async def api_generate(req: GenRequest):
    """One-shot (non-streaming) completion, returned as {"response": "..."}.

    Raises HTTP 400 when the prompt is empty.
    """
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")

    def _infer():
        return llm.create_completion(
            # FIX: end with the assistant tag so the model answers instead
            # of continuing the user turn (tag is also a stop token).
            f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
            max_tokens=req.max_tokens or MAX_TOKENS,
            temperature=0.7,
            top_p=0.9,
            stream=False,
            stop=["<|user|>", "<|assistant|>", "</s>"],
        )

    # FIX: llama.cpp inference is CPU-bound and blocking; calling it directly
    # inside this async handler would stall the event loop (freezing the
    # Gradio UI for every other client). Offload it to a worker thread —
    # `asyncio` was already imported at the top of the file but never used.
    gen = await asyncio.to_thread(_infer)
    return {"response": gen["choices"][0]["text"].strip()}
103
+
104
if __name__ == "__main__":  # HF launches `python app.py`
    import uvicorn

    # Bind on every interface; HF injects PORT, otherwise fall back to 7860.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.29.0
2
+ fastapi
3
+ uvicorn[standard]
4
+ llama-cpp-python>=0.2.38
5
+ huggingface-hub