| import os |
| import time |
| import json |
| import threading |
| from flask import Flask, request, Response, stream_with_context |
| from huggingface_hub import hf_hub_download |
|
|
| |
# Where the quantized model weights are cached between restarts.
MODEL_DIR = "/tmp/models"
# Hugging Face repo and the specific GGUF quantization to serve.
REPO = "mradermacher/LFM2-2.6B-Uncensored-X64-GGUF"
FILENAME = "LFM2-2.6B-Uncensored-X64.Q3_K_S.gguf"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)


os.makedirs(MODEL_DIR, exist_ok=True)


# One-time blocking download at import time.  hf_hub_download with
# local_dir=MODEL_DIR places the file at MODEL_DIR/FILENAME, which matches
# MODEL_PATH, so the existence guard skips the download on warm restarts.
# NOTE(review): a previously interrupted partial file would also pass this
# guard — confirm hf_hub_download cleans up failed downloads.
if not os.path.exists(MODEL_PATH):
    print(f"Downloading {FILENAME} ...")
    hf_hub_download(repo_id=REPO, filename=FILENAME, local_dir=MODEL_DIR)
    print("Download complete.")
|
|
| |
# Imported after the download so a failed fetch surfaces before the slow
# vLLM engine import/initialization.
from vllm import LLM, SamplingParams


print("Loading model with vLLM ...")
# BUG FIX: the tokenizer must belong to the same model family as the
# weights.  The previous value ("meta-llama/Llama-2-7b-hf") is a different
# — and gated — model's tokenizer, which maps token ids to the wrong
# vocabulary for LFM2 weights and produces garbage output (or an auth
# failure on the gated repo).  Use the base repo this GGUF quant derives from.
llm_engine = LLM(
    model=MODEL_PATH,
    tokenizer="LiquidAI/LFM2-2.6B",
    max_model_len=2048,
    dtype="float32",            # CPU backend: full precision, no half-float ops
    device="cpu",
    enforce_eager=True,         # skip CUDA-graph capture; irrelevant on CPU
    gpu_memory_utilization=0.0, # no GPU memory to reserve
)
print("Model loaded.")
|
|
| app = Flask(__name__) |
|
|
| HTML_PAGE = """ |
| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>LFM2-2.6B Chat</title> |
| <style> |
| * { box-sizing: border-box; margin: 0; padding: 0; } |
| body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #1a1a2e; color: #eee; display: flex; justify-content: center; align-items: center; min-height: 100vh; } |
| .container { width: 100%; max-width: 800px; padding: 20px; } |
| h1 { text-align: center; margin-bottom: 6px; color: #e94560; font-size: 1.5rem; } |
| .subtitle { text-align: center; margin-bottom: 20px; color: #888; font-size: 0.85rem; } |
| .chatbox { background: #16213e; border-radius: 12px; padding: 20px; height: 55vh; overflow-y: auto; margin-bottom: 15px; border: 1px solid #0f3460; } |
| .msg { margin-bottom: 14px; line-height: 1.6; white-space: pre-wrap; } |
| .msg.user { color: #e94560; } |
| .msg.user::before { content: "You: "; font-weight: bold; } |
| .msg.bot { color: #a8d8ea; } |
| .msg.bot::before { content: "AI: "; font-weight: bold; } |
| .stats { color: #666; font-size: 0.78rem; margin-top: 4px; } |
| .input-row { display: flex; gap: 10px; } |
| textarea { flex: 1; padding: 12px; border-radius: 8px; border: 1px solid #0f3460; background: #16213e; color: #eee; font-size: 1rem; resize: none; height: 60px; font-family: inherit; } |
| textarea:focus { outline: none; border-color: #e94560; } |
| button { padding: 12px 28px; border-radius: 8px; border: none; background: #e94560; color: #fff; font-size: 1rem; cursor: pointer; font-weight: bold; } |
| button:hover { background: #c73650; } |
| button:disabled { background: #555; cursor: not-allowed; } |
| .settings { display: flex; gap: 15px; margin-bottom: 15px; flex-wrap: wrap; align-items: center; } |
| .settings label { font-size: 0.85rem; color: #aaa; } |
| .settings input { background: #16213e; border: 1px solid #0f3460; color: #eee; padding: 5px 8px; border-radius: 6px; width: 80px; } |
| </style> |
| </head> |
| <body> |
| <div class="container"> |
| <h1>LFM2-2.6B Uncensored</h1> |
| <p class="subtitle">Running on CPU via vLLM</p> |
| <div class="settings"> |
| <label>Max tokens: <input type="number" id="maxTokens" value="256" min="16" max="2048"></label> |
| <label>Temperature: <input type="number" id="temperature" value="0.7" min="0" max="2" step="0.1"></label> |
| <label>Top-P: <input type="number" id="topP" value="0.9" min="0" max="1" step="0.05"></label> |
| <button onclick="clearChat()" style="padding:5px 14px;font-size:0.85rem;">Clear</button> |
| </div> |
| <div class="chatbox" id="chatbox"></div> |
| <div class="input-row"> |
| <textarea id="userInput" placeholder="Type your message..." onkeydown="if(event.key==='Enter'&&!event.shiftKey){event.preventDefault();sendMsg();}"></textarea> |
| <button id="sendBtn" onclick="sendMsg()">Send</button> |
| </div> |
| </div> |
| <script> |
| const chatbox = document.getElementById('chatbox'); |
| const userInput = document.getElementById('userInput'); |
| const sendBtn = document.getElementById('sendBtn'); |
| let history = []; |
| |
| function addMsg(role, text) { |
| const div = document.createElement('div'); |
| div.className = 'msg ' + role; |
| div.textContent = text; |
| chatbox.appendChild(div); |
| chatbox.scrollTop = chatbox.scrollHeight; |
| return div; |
| } |
| |
| function clearChat() { history = []; chatbox.innerHTML = ''; } |
| |
| async function sendMsg() { |
| const text = userInput.value.trim(); |
| if (!text) return; |
| userInput.value = ''; |
| addMsg('user', text); |
| history.push({role:'user', content:text}); |
| sendBtn.disabled = true; |
| |
| const botDiv = addMsg('bot', ''); |
| botDiv.textContent = ''; |
| |
| try { |
| const resp = await fetch('/chat', { |
| method: 'POST', |
| headers: {'Content-Type':'application/json'}, |
| body: JSON.stringify({ |
| messages: history, |
| max_tokens: parseInt(document.getElementById('maxTokens').value)||256, |
| temperature: parseFloat(document.getElementById('temperature').value)||0.7, |
| top_p: parseFloat(document.getElementById('topP').value)||0.9 |
| }) |
| }); |
| const reader = resp.body.getReader(); |
| const decoder = new TextDecoder(); |
| let full = ''; |
| |
| while (true) { |
| const {done, value} = await reader.read(); |
| if (done) break; |
| const chunk = decoder.decode(value, {stream:true}); |
| for (const line of chunk.split('\\n')) { |
| if (!line.startsWith('data: ')) continue; |
| const d = line.slice(6); |
| if (d === '[DONE]') continue; |
| try { |
| const j = JSON.parse(d); |
| if (j.token) { full += j.token; } |
| if (j.stats) { |
| const s = document.createElement('div'); |
| s.className = 'stats'; |
| s.textContent = j.stats; |
| botDiv.textContent = full; |
| botDiv.appendChild(s); |
| } else { |
| botDiv.textContent = full; |
| } |
| } catch(e){} |
| } |
| } |
| history.push({role:'assistant', content:full}); |
| } catch(e) { |
| botDiv.textContent = 'Error: ' + e.message; |
| } |
| sendBtn.disabled = false; |
| chatbox.scrollTop = chatbox.scrollHeight; |
| } |
| </script> |
| </body> |
| </html> |
| """ |
|
|
|
|
def build_prompt(messages):
    """Flatten a chat history into the model's role-tag prompt format.

    Each message dict must carry "role" ("user", "assistant" or "system")
    and "content"; messages with any other role are silently skipped.
    A trailing "<|assistant|>" tag cues the model to produce its reply.
    """
    tag_for = {
        "user": "<|user|>",
        "assistant": "<|assistant|>",
        "system": "<|system|>",
    }
    parts = []
    for message in messages:
        tag = tag_for.get(message["role"])
        if tag is not None:
            parts.append(f"{tag}\n{message['content']}\n")
    parts.append("<|assistant|>\n")
    return "".join(parts)
|
|
|
|
| @app.route("/") |
| def index(): |
| return HTML_PAGE |
|
|
|
|
| @app.route("/chat", methods=["POST"]) |
| def chat(): |
| data = request.json |
| messages = data.get("messages", []) |
| max_tokens = min(data.get("max_tokens", 256), 2048) |
| temperature = data.get("temperature", 0.7) |
| top_p = data.get("top_p", 0.9) |
|
|
| prompt = build_prompt(messages) |
|
|
| sampling_params = SamplingParams( |
| max_tokens=max_tokens, |
| temperature=temperature, |
| top_p=top_p, |
| stop=["<|user|>", "<|assistant|>", "<|end|>", "<|endoftext|>"], |
| ) |
|
|
| def generate(): |
| start = time.perf_counter() |
| token_count = 0 |
|
|
| |
| results = llm_engine.generate([prompt], sampling_params, use_tqdm=False) |
|
|
| for request_output in results: |
| output_text = request_output.outputs[0].text |
| token_ids = request_output.outputs[0].token_ids |
| token_count = len(token_ids) |
|
|
| |
| yield f"data: {json.dumps({'token': output_text})}\n\n" |
|
|
| elapsed = time.perf_counter() - start |
| tps = token_count / elapsed if elapsed > 0 else 0 |
| stats = f"{token_count} tokens in {elapsed:.1f}s \u2014 {tps:.2f} tokens/s" |
| yield f"data: {json.dumps({'stats': stats})}\n\n" |
| yield "data: [DONE]\n\n" |
|
|
| return Response(stream_with_context(generate()), mimetype="text/event-stream") |
|
|
|
|
| if __name__ == "__main__": |
| port = int(os.environ.get("PORT", 7860)) |
| app.run(host="0.0.0.0", port=port, debug=False, threaded=True) |