import os
import time
import json
from flask import Flask, request, Response, stream_with_context
from huggingface_hub import hf_hub_download
# --- Config ---
MODEL_DIR = "/tmp/models"
REPO = "mradermacher/LFM2-2.6B-Uncensored-X64-GGUF"
FILENAME = "LFM2-2.6B-Uncensored-X64.Q3_K_S.gguf"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
os.makedirs(MODEL_DIR, exist_ok=True)
# --- Download model ---
if not os.path.exists(MODEL_PATH):
    print(f"Downloading {FILENAME} ...")
    hf_hub_download(repo_id=REPO, filename=FILENAME, local_dir=MODEL_DIR)
    print("Download complete.")
# --- Load vLLM engine ---
from vllm import LLM, SamplingParams
print("Loading model with vLLM ...")
llm_engine = LLM(
    model=MODEL_PATH,
    tokenizer="meta-llama/Llama-2-7b-hf",  # fallback tokenizer for GGUF
    max_model_len=2048,
    dtype="float32",  # CPU needs float32
    device="cpu",
    enforce_eager=True,  # no CUDA graphs on CPU
    gpu_memory_utilization=0.0,
)
print("Model loaded.")
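# Optional smoke test (a sketch, not part of the serving path; the prompt text
# and token budget below are illustrative). Uncomment to confirm the engine can
# decode before the HTTP server starts:
# _warmup = llm_engine.generate(["Hello"], SamplingParams(max_tokens=8), use_tqdm=False)
# print("Warmup output:", _warmup[0].outputs[0].text)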
app = Flask(__name__)
HTML_PAGE = """
LFM2-2.6B Chat
"""
def build_prompt(messages):
    prompt = ""
    for msg in messages:
        role = msg["role"]
        content = msg["content"]
        if role == "user":
            prompt += f"<|user|>\n{content}\n"
        elif role == "assistant":
            prompt += f"<|assistant|>\n{content}\n"
        elif role == "system":
            prompt += f"<|system|>\n{content}\n"
    prompt += "<|assistant|>\n"
    return prompt
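# Illustrative example of the prompt format produced above:
# build_prompt([{"role": "user", "content": "Hi"}]) returns
# "<|user|>\nHi\n<|assistant|>\n", leaving the assistant turn open for the
# model to complete.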
@app.route("/")
def index():
    return HTML_PAGE
@app.route("/chat", methods=["POST"])
def chat():
    data = request.json
    messages = data.get("messages", [])
    max_tokens = min(data.get("max_tokens", 256), 2048)
    temperature = data.get("temperature", 0.7)
    top_p = data.get("top_p", 0.9)
    prompt = build_prompt(messages)
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["<|user|>", "<|assistant|>", "<|end|>", "<|endoftext|>"],
    )
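    # NOTE: the stop strings above end generation as soon as the model starts a
    # new role tag, keeping the reply to a single assistant turn.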
    def generate():
        start = time.perf_counter()
        token_count = 0
        # Offline LLM.generate blocks until decoding finishes and returns a
        # list of completed RequestOutputs (one per prompt); it is not an
        # incremental token stream.
        results = llm_engine.generate([prompt], sampling_params, use_tqdm=False)
        for request_output in results:
            output_text = request_output.outputs[0].text
            token_ids = request_output.outputs[0].token_ids
            token_count = len(token_ids)
            # Send the full completion as a single SSE chunk.
            yield f"data: {json.dumps({'token': output_text})}\n\n"
        elapsed = time.perf_counter() - start
        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"{token_count} tokens in {elapsed:.1f}s \u2014 {tps:.2f} tokens/s"
        yield f"data: {json.dumps({'stats': stats})}\n\n"
        yield "data: [DONE]\n\n"
    return Response(stream_with_context(generate()), mimetype="text/event-stream")
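# Example client (a sketch; assumes the third-party `requests` package and a
# server running locally on the default port):
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/chat",
#       json={"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64},
#       stream=True,
#   )
#   for line in resp.iter_lines():
#       if line:
#           print(line.decode())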
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port, debug=False, threaded=True)