# hf_dead / app.py
# OrbitMC's picture
# Update app.py
# 89d037e verified
import os
import time
import json
import threading
from flask import Flask, request, Response, stream_with_context
from huggingface_hub import hf_hub_download
# --- Config ---
# Local directory for the model file, the Hub repository it comes from,
# and the name of the GGUF file inside that repository.
MODEL_DIR = "/tmp/models"
REPO = "mradermacher/LFM2-2.6B-Uncensored-X64-GGUF"
FILENAME = "LFM2-2.6B-Uncensored-X64.Q3_K_S.gguf"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
os.makedirs(MODEL_DIR, exist_ok=True)

# --- Download model ---
# Fetch the file only when it is not already present at MODEL_PATH.
_weights_present = os.path.exists(MODEL_PATH)
if not _weights_present:
    print(f"Downloading {FILENAME} ...")
    hf_hub_download(local_dir=MODEL_DIR, filename=FILENAME, repo_id=REPO)
    print("Download complete.")
# --- Load vLLM engine ---
# vLLM is imported at this point in the script, after the download step above.
from vllm import LLM, SamplingParams

print("Loading model with vLLM ...")

# Engine settings, collected in one place before the engine is constructed.
# NOTE(review): the tokenizer is a Llama-2 checkpoint while the weights are an
# LFM2 GGUF file -- confirm the two vocabularies actually match.
_ENGINE_OPTIONS = dict(
    model=MODEL_PATH,
    tokenizer="meta-llama/Llama-2-7b-hf",  # fallback tokenizer for GGUF
    max_model_len=2048,
    dtype="float32",  # CPU needs float32
    device="cpu",
    enforce_eager=True,  # no CUDA graphs on CPU
    gpu_memory_utilization=0.0,
)
llm_engine = LLM(**_ENGINE_OPTIONS)
print("Model loaded.")

# The Flask application object that the route decorators below attach to.
app = Flask(__name__)
# Single-page chat UI: markup, styles and the client-side script that posts the
# conversation to /chat and renders the "data: ..." events of the response.
# The script buffers the response across network reads so an event split
# between two reads is still parsed, accepts 0 as a temperature, ignores Enter
# while a request is in flight, and reports non-2xx responses.
HTML_PAGE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LFM2-2.6B Chat</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #1a1a2e; color: #eee; display: flex; justify-content: center; align-items: center; min-height: 100vh; }
.container { width: 100%; max-width: 800px; padding: 20px; }
h1 { text-align: center; margin-bottom: 6px; color: #e94560; font-size: 1.5rem; }
.subtitle { text-align: center; margin-bottom: 20px; color: #888; font-size: 0.85rem; }
.chatbox { background: #16213e; border-radius: 12px; padding: 20px; height: 55vh; overflow-y: auto; margin-bottom: 15px; border: 1px solid #0f3460; }
.msg { margin-bottom: 14px; line-height: 1.6; white-space: pre-wrap; }
.msg.user { color: #e94560; }
.msg.user::before { content: "You: "; font-weight: bold; }
.msg.bot { color: #a8d8ea; }
.msg.bot::before { content: "AI: "; font-weight: bold; }
.stats { color: #666; font-size: 0.78rem; margin-top: 4px; }
.input-row { display: flex; gap: 10px; }
textarea { flex: 1; padding: 12px; border-radius: 8px; border: 1px solid #0f3460; background: #16213e; color: #eee; font-size: 1rem; resize: none; height: 60px; font-family: inherit; }
textarea:focus { outline: none; border-color: #e94560; }
button { padding: 12px 28px; border-radius: 8px; border: none; background: #e94560; color: #fff; font-size: 1rem; cursor: pointer; font-weight: bold; }
button:hover { background: #c73650; }
button:disabled { background: #555; cursor: not-allowed; }
.settings { display: flex; gap: 15px; margin-bottom: 15px; flex-wrap: wrap; align-items: center; }
.settings label { font-size: 0.85rem; color: #aaa; }
.settings input { background: #16213e; border: 1px solid #0f3460; color: #eee; padding: 5px 8px; border-radius: 6px; width: 80px; }
</style>
</head>
<body>
<div class="container">
<h1>LFM2-2.6B Uncensored</h1>
<p class="subtitle">Running on CPU via vLLM</p>
<div class="settings">
<label>Max tokens: <input type="number" id="maxTokens" value="256" min="16" max="2048"></label>
<label>Temperature: <input type="number" id="temperature" value="0.7" min="0" max="2" step="0.1"></label>
<label>Top-P: <input type="number" id="topP" value="0.9" min="0" max="1" step="0.05"></label>
<button onclick="clearChat()" style="padding:5px 14px;font-size:0.85rem;">Clear</button>
</div>
<div class="chatbox" id="chatbox"></div>
<div class="input-row">
<textarea id="userInput" placeholder="Type your message..." onkeydown="if(event.key==='Enter'&&!event.shiftKey){event.preventDefault();sendMsg();}"></textarea>
<button id="sendBtn" onclick="sendMsg()">Send</button>
</div>
</div>
<script>
const chatbox = document.getElementById('chatbox');
const userInput = document.getElementById('userInput');
const sendBtn = document.getElementById('sendBtn');
let history = [];
function addMsg(role, text) {
const div = document.createElement('div');
div.className = 'msg ' + role;
div.textContent = text;
chatbox.appendChild(div);
chatbox.scrollTop = chatbox.scrollHeight;
return div;
}
function clearChat() { history = []; chatbox.innerHTML = ''; }
function readTemperature() {
  // 0 is a valid temperature, so fall back only when the field is not a number.
  const t = parseFloat(document.getElementById('temperature').value);
  return Number.isFinite(t) ? t : 0.7;
}
async function sendMsg() {
  // The Enter shortcut bypasses the disabled button, so block overlapping requests here.
  if (sendBtn.disabled) return;
  const text = userInput.value.trim();
  if (!text) return;
  userInput.value = '';
  addMsg('user', text);
  history.push({role:'user', content:text});
  sendBtn.disabled = true;
  const botDiv = addMsg('bot', '');
  let full = '';
  // Handle one complete line of the event stream.
  const handleLine = (line) => {
    if (!line.startsWith('data: ')) return;
    const d = line.slice(6);
    if (d === '[DONE]') return;
    let j;
    try {
      j = JSON.parse(d);
    } catch (e) {
      console.warn('Skipping malformed event:', d);
      return;
    }
    if (j.token) { full += j.token; }
    botDiv.textContent = full;
    if (j.stats) {
      const s = document.createElement('div');
      s.className = 'stats';
      s.textContent = j.stats;
      botDiv.appendChild(s);
    }
  };
  try {
    const resp = await fetch('/chat', {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      body: JSON.stringify({
        messages: history,
        max_tokens: parseInt(document.getElementById('maxTokens').value)||256,
        temperature: readTemperature(),
        top_p: parseFloat(document.getElementById('topP').value)||0.9
      })
    });
    if (!resp.ok) throw new Error('HTTP ' + resp.status);
    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    // A network read can end in the middle of an event, so the unfinished
    // tail is kept and completed by the next read.
    let buffer = '';
    while (true) {
      const {done, value} = await reader.read();
      buffer += done ? decoder.decode() : decoder.decode(value, {stream:true});
      const lines = buffer.split('\\n');
      buffer = done ? '' : lines.pop();
      for (const line of lines) handleLine(line);
      if (done) break;
    }
    history.push({role:'assistant', content:full});
  } catch(e) {
    botDiv.textContent = 'Error: ' + e.message;
  } finally {
    sendBtn.disabled = false;
    chatbox.scrollTop = chatbox.scrollHeight;
  }
}
</script>
</body>
</html>
"""
# Chat roles that are rendered into the prompt, mapped to their tag line.
_ROLE_TAGS = {
    "user": "<|user|>",
    "assistant": "<|assistant|>",
    "system": "<|system|>",
}


def build_prompt(messages):
    """Render a chat history into the tagged text prompt sent to the model.

    Each message with a recognised role becomes ``<|role|>\\n{content}\\n``;
    the prompt always ends with an open ``<|assistant|>\\n`` turn for the
    model to complete.

    Args:
        messages: A list of ``{"role": ..., "content": ...}`` dicts. The list
            comes from the client, so anything else (``None``, a non-list,
            entries that are not dicts, entries without a usable role or
            content) is ignored instead of raising.

    Returns:
        The prompt string.
    """
    if not isinstance(messages, (list, tuple)):
        messages = ()

    parts = []
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        content = msg.get("content")
        # Non-string roles (possibly unhashable) can never match a tag.
        if not isinstance(role, str) or content is None:
            continue
        tag = _ROLE_TAGS.get(role)
        if tag is None:
            # Unknown roles are dropped.
            continue
        parts.append(f"{tag}\n{content}\n")

    parts.append("<|assistant|>\n")
    return "".join(parts)
@app.route("/")
def index():
    """Serve the chat page stored in ``HTML_PAGE``."""
    page = HTML_PAGE
    return page
# Hard ceiling on the number of tokens a single request may generate.
_MAX_NEW_TOKENS = 2048
# Smallest top_p forwarded to the engine; vLLM's SamplingParams requires
# top_p in (0, 1], so 0 and negative values are raised to this floor.
_MIN_TOP_P = 0.01
# The app is served with threaded=True but there is a single engine object.
# LLM.generate is an offline, blocking call that drives the shared engine
# loop, so concurrent callers are serialized here.
_generate_lock = threading.Lock()


def _bounded_number(value, default, low, high, cast=float):
    """Coerce a client-supplied value with ``cast`` and clamp it to [low, high].

    Returns ``default`` when the value is missing, of the wrong type, or NaN.
    """
    try:
        number = cast(value)
    except (TypeError, ValueError, OverflowError):
        return default
    if number != number:  # NaN never equals itself
        return default
    return max(low, min(high, number))


def _sse(payload):
    """Format a JSON-serializable payload as one ``data:`` event."""
    return f"data: {json.dumps(payload)}\n\n"


def _bad_request(message):
    """Build a 400 response carrying a JSON ``error`` message."""
    return Response(
        json.dumps({"error": message}),
        status=400,
        mimetype="application/json",
    )


@app.route("/chat", methods=["POST"])
def chat():
    """Generate a reply for the posted conversation and return it as an event stream.

    Expects a JSON object with ``messages`` (list of role/content dicts) and
    optional ``max_tokens``, ``temperature`` and ``top_p``. Sampling values
    that are missing or unusable fall back to their defaults (256, 0.7, 0.9);
    out-of-range values are clamped.

    Returns:
        A ``text/event-stream`` response that emits ``{"token": text}`` with
        the generated text, then ``{"stats": ...}``, then ``[DONE]``. A 400
        JSON response is returned when the body is not a JSON object or the
        messages cannot be turned into a prompt.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return _bad_request("Request body must be a JSON object.")

    messages = data.get("messages") or []
    if not isinstance(messages, list):
        return _bad_request("'messages' must be a list.")

    max_tokens = _bounded_number(
        data.get("max_tokens"), 256, 1, _MAX_NEW_TOKENS, cast=int
    )
    temperature = _bounded_number(data.get("temperature"), 0.7, 0.0, 2.0)
    top_p = _bounded_number(data.get("top_p"), 0.9, _MIN_TOP_P, 1.0)

    try:
        prompt = build_prompt(messages)
    except (KeyError, TypeError):
        return _bad_request("Each message needs 'role' and 'content' fields.")

    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["<|user|>", "<|assistant|>", "<|end|>", "<|endoftext|>"],
    )

    def generate():
        token_count = 0
        try:
            with _generate_lock:
                # Start timing once the engine is ours, so the reported speed
                # does not include time spent waiting for other requests.
                start = time.perf_counter()
                # LLM.generate blocks until the completion is finished and
                # returns a list of results; nothing is streamed token by token.
                results = llm_engine.generate(
                    [prompt], sampling_params, use_tqdm=False
                )
        except Exception:
            # The response has already started, so report the failure in-band
            # (as a stats line, which the page displays) and keep details in the log.
            app.logger.exception("Generation failed")
            yield _sse({"stats": "Generation failed; see server log."})
            yield "data: [DONE]\n\n"
            return

        for request_output in results:
            completion = request_output.outputs[0]
            token_count = len(completion.token_ids)
            # The whole completion is sent as a single "token" event.
            yield _sse({"token": completion.text})

        elapsed = time.perf_counter() - start
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"{token_count} tokens in {elapsed:.1f}s \u2014 {tps:.2f} tokens/s"
        yield _sse({"stats": stats})
        yield "data: [DONE]\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache"},
    )
if __name__ == "__main__":
    # Listen on the port named by the PORT environment variable, or 7860
    # when it is unset, on all interfaces with threaded request handling.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port, debug=False, threaded=True)