Spaces:

OrbitMC
/

hf_dead

Runtime error

App Files Files Community

hf_dead / app.py

OrbitMC

Update app.py

89d037e verified 21 days ago

raw

history blame contribute delete

8.27 kB

	import os
	import time
	import json
	import threading
	from flask import Flask, request, Response, stream_with_context
	from huggingface_hub import hf_hub_download

	# --- Config ---
	MODEL_DIR = "/tmp/models"
	REPO = "mradermacher/LFM2-2.6B-Uncensored-X64-GGUF"
	FILENAME = "LFM2-2.6B-Uncensored-X64.Q3_K_S.gguf"
	MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

	os.makedirs(MODEL_DIR, exist_ok=True)

	# --- Download model ---
	if not os.path.exists(MODEL_PATH):
	print(f"Downloading {FILENAME} ...")
	hf_hub_download(repo_id=REPO, filename=FILENAME, local_dir=MODEL_DIR)
	print("Download complete.")

	# --- Load vLLM engine ---
	from vllm import LLM, SamplingParams

	print("Loading model with vLLM ...")
	llm_engine = LLM(
	model=MODEL_PATH,
	tokenizer="meta-llama/Llama-2-7b-hf", # fallback tokenizer for GGUF
	max_model_len=2048,
	dtype="float32", # CPU needs float32
	device="cpu",
	enforce_eager=True, # no CUDA graphs on CPU
	gpu_memory_utilization=0.0,
	)
	print("Model loaded.")

	app = Flask(__name__)

	HTML_PAGE = """
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>LFM2-2.6B Chat</title>
	<style>
	* { box-sizing: border-box; margin: 0; padding: 0; }
	body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #1a1a2e; color: #eee; display: flex; justify-content: center; align-items: center; min-height: 100vh; }
	.container { width: 100%; max-width: 800px; padding: 20px; }
	h1 { text-align: center; margin-bottom: 6px; color: #e94560; font-size: 1.5rem; }
	.subtitle { text-align: center; margin-bottom: 20px; color: #888; font-size: 0.85rem; }
	.chatbox { background: #16213e; border-radius: 12px; padding: 20px; height: 55vh; overflow-y: auto; margin-bottom: 15px; border: 1px solid #0f3460; }
	.msg { margin-bottom: 14px; line-height: 1.6; white-space: pre-wrap; }
	.msg.user { color: #e94560; }
	.msg.user::before { content: "You: "; font-weight: bold; }
	.msg.bot { color: #a8d8ea; }
	.msg.bot::before { content: "AI: "; font-weight: bold; }
	.stats { color: #666; font-size: 0.78rem; margin-top: 4px; }
	.input-row { display: flex; gap: 10px; }
	textarea { flex: 1; padding: 12px; border-radius: 8px; border: 1px solid #0f3460; background: #16213e; color: #eee; font-size: 1rem; resize: none; height: 60px; font-family: inherit; }
	textarea:focus { outline: none; border-color: #e94560; }
	button { padding: 12px 28px; border-radius: 8px; border: none; background: #e94560; color: #fff; font-size: 1rem; cursor: pointer; font-weight: bold; }
	button:hover { background: #c73650; }
	button:disabled { background: #555; cursor: not-allowed; }
	.settings { display: flex; gap: 15px; margin-bottom: 15px; flex-wrap: wrap; align-items: center; }
	.settings label { font-size: 0.85rem; color: #aaa; }
	.settings input { background: #16213e; border: 1px solid #0f3460; color: #eee; padding: 5px 8px; border-radius: 6px; width: 80px; }
	</style>
	</head>
	<body>
	<div class="container">
	<h1>LFM2-2.6B Uncensored</h1>
	<p class="subtitle">Running on CPU via vLLM</p>
	<div class="settings">
	<label>Max tokens: <input type="number" id="maxTokens" value="256" min="16" max="2048"></label>
	<label>Temperature: <input type="number" id="temperature" value="0.7" min="0" max="2" step="0.1"></label>
	<label>Top-P: <input type="number" id="topP" value="0.9" min="0" max="1" step="0.05"></label>
	<button onclick="clearChat()" style="padding:5px 14px;font-size:0.85rem;">Clear</button>
	</div>
	<div class="chatbox" id="chatbox"></div>
	<div class="input-row">
	<textarea id="userInput" placeholder="Type your message..." onkeydown="if(event.key==='Enter'&&!event.shiftKey){event.preventDefault();sendMsg();}"></textarea>
	<button id="sendBtn" onclick="sendMsg()">Send</button>
	</div>
	</div>
	<script>
	const chatbox = document.getElementById('chatbox');
	const userInput = document.getElementById('userInput');
	const sendBtn = document.getElementById('sendBtn');
	let history = [];

	function addMsg(role, text) {
	const div = document.createElement('div');
	div.className = 'msg ' + role;
	div.textContent = text;
	chatbox.appendChild(div);
	chatbox.scrollTop = chatbox.scrollHeight;
	return div;
	}

	function clearChat() { history = []; chatbox.innerHTML = ''; }

	async function sendMsg() {
	const text = userInput.value.trim();
	if (!text) return;
	userInput.value = '';
	addMsg('user', text);
	history.push({role:'user', content:text});
	sendBtn.disabled = true;

	const botDiv = addMsg('bot', '');
	botDiv.textContent = '';

	try {
	const resp = await fetch('/chat', {
	method: 'POST',
	headers: {'Content-Type':'application/json'},
	body: JSON.stringify({
	messages: history,
	max_tokens: parseInt(document.getElementById('maxTokens').value)\|\|256,
	temperature: parseFloat(document.getElementById('temperature').value)\|\|0.7,
	top_p: parseFloat(document.getElementById('topP').value)\|\|0.9
	})
	});
	const reader = resp.body.getReader();
	const decoder = new TextDecoder();
	let full = '';

	while (true) {
	const {done, value} = await reader.read();
	if (done) break;
	const chunk = decoder.decode(value, {stream:true});
	for (const line of chunk.split('\\n')) {
	if (!line.startsWith('data: ')) continue;
	const d = line.slice(6);
	if (d === '[DONE]') continue;
	try {
	const j = JSON.parse(d);
	if (j.token) { full += j.token; }
	if (j.stats) {
	const s = document.createElement('div');
	s.className = 'stats';
	s.textContent = j.stats;
	botDiv.textContent = full;
	botDiv.appendChild(s);
	} else {
	botDiv.textContent = full;
	}
	} catch(e){}
	}
	}
	history.push({role:'assistant', content:full});
	} catch(e) {
	botDiv.textContent = 'Error: ' + e.message;
	}
	sendBtn.disabled = false;
	chatbox.scrollTop = chatbox.scrollHeight;
	}
	</script>
	</body>
	</html>
	"""


	def build_prompt(messages):
	prompt = ""
	for msg in messages:
	role = msg["role"]
	content = msg["content"]
	if role == "user":
	prompt += f"<\|user\|>\n{content}\n"
	elif role == "assistant":
	prompt += f"<\|assistant\|>\n{content}\n"
	elif role == "system":
	prompt += f"<\|system\|>\n{content}\n"
	prompt += "<\|assistant\|>\n"
	return prompt


	@app.route("/")
	def index():
	return HTML_PAGE


	@app.route("/chat", methods=["POST"])
	def chat():
	data = request.json
	messages = data.get("messages", [])
	max_tokens = min(data.get("max_tokens", 256), 2048)
	temperature = data.get("temperature", 0.7)
	top_p = data.get("top_p", 0.9)

	prompt = build_prompt(messages)

	sampling_params = SamplingParams(
	max_tokens=max_tokens,
	temperature=temperature,
	top_p=top_p,
	stop=["<\|user\|>", "<\|assistant\|>", "<\|end\|>", "<\|endoftext\|>"],
	)

	def generate():
	start = time.perf_counter()
	token_count = 0

	# vLLM streaming via generate iterator
	results = llm_engine.generate([prompt], sampling_params, use_tqdm=False)

	for request_output in results:
	output_text = request_output.outputs[0].text
	token_ids = request_output.outputs[0].token_ids
	token_count = len(token_ids)

	# Send full text as a single chunk (vLLM batches on CPU)
	yield f"data: {json.dumps({'token': output_text})}\n\n"

	elapsed = time.perf_counter() - start
	tps = token_count / elapsed if elapsed > 0 else 0
	stats = f"{token_count} tokens in {elapsed:.1f}s \u2014 {tps:.2f} tokens/s"
	yield f"data: {json.dumps({'stats': stats})}\n\n"
	yield "data: [DONE]\n\n"

	return Response(stream_with_context(generate()), mimetype="text/event-stream")


	if __name__ == "__main__":
	port = int(os.environ.get("PORT", 7860))
	app.run(host="0.0.0.0", port=port, debug=False, threaded=True)