# infinityllm/app.py
import os
import json

from flask import Flask, request, jsonify, Response, stream_with_context
from huggingface_hub import InferenceClient

app = Flask(__name__)

# Configuration comes from the environment. HF_TOKEN authenticates calls to
# the Hugging Face Inference API; DEFAULT_MODEL is used when a request does
# not name a model.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "Qwen/Qwen2.5-72B-Instruct")

client = InferenceClient(token=HF_TOKEN)
@app.route("/", methods=["GET"])
def index():
return jsonify({"status": "ok", "message": "InfinityLLM API is running"})
@app.route("/v1/models", methods=["GET"])
def models():
return jsonify({
"object": "list",
"data": [
{"id": "Qwen/Qwen2.5-72B-Instruct", "object": "model"},
{"id": "Qwen/Qwen2.5-7B-Instruct", "object": "model"},
{"id": "Qwen/Qwen2.5-Coder-32B-Instruct", "object": "model"},
{"id": "meta-llama/Llama-3.3-70B-Instruct", "object": "model"},
{"id": "meta-llama/Llama-3.1-8B-Instruct", "object": "model"},
{"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "object": "model"},
{"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B","object": "model"},
{"id": "moonshotai/Kimi-K2.6", "object": "model"},
{"id": "MiniMaxAI/MiniMax-M2.7", "object": "model"},
{"id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", "object": "model"}, ]
})
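
# Example call (assumption: the app is reachable locally on port 7860, the
# default port a Hugging Face Space exposes):
#
#   curl http://localhost:7860/v1/models
#
# which returns the static list above as {"object": "list", "data": [...]}.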
@app.route("/v1/chat/completions", methods=["POST"])
def chat():
data = request.json
if not data:
return jsonify({"error": "No JSON body"}), 400
messages = data.get("messages", [])
model = data.get("model", DEFAULT_MODEL)
max_tokens = int(data.get("max_tokens", 2048))
temperature = float(data.get("temperature", 0.7))
stream = data.get("stream", False)

    try:
        if stream:
            def generate():
                # Relay the upstream token stream as OpenAI-style
                # server-sent events (SSE).
                try:
                    for chunk in client.chat_completion(
                        model=model,
                        messages=messages,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        stream=True,
                    ):
                        delta_content = ""
                        if chunk.choices and chunk.choices[0].delta:
                            delta_content = chunk.choices[0].delta.content or ""
                        chunk_data = {
                            "id": "chatcmpl-hf",
                            "object": "chat.completion.chunk",
                            "model": model,
                            "choices": [{
                                "index": 0,
                                "delta": {"role": "assistant", "content": delta_content},
                                "finish_reason": None,
                            }],
                        }
                        yield f"data: {json.dumps(chunk_data)}\n\n"
                    # Terminal chunk with finish_reason, then the [DONE]
                    # sentinel OpenAI clients expect.
                    final = {
                        "id": "chatcmpl-hf",
                        "object": "chat.completion.chunk",
                        "model": model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop",
                        }],
                    }
                    yield f"data: {json.dumps(final)}\n\n"
                    yield "data: [DONE]\n\n"
                except Exception as e:
                    # Surface upstream failures inside the stream itself.
                    yield f"data: {json.dumps({'error': str(e)})}\n\n"

            return Response(
                stream_with_context(generate()),
                mimetype="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "X-Accel-Buffering": "no",  # keep reverse proxies from buffering the stream
                },
            )
        else:
            response = client.chat_completion(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=False,
            )
            content = ""
            if response.choices:
                choice = response.choices[0]
                if getattr(choice, "message", None):
                    content = choice.message.content or ""
            if not content:
                return jsonify({"error": "Empty response from model"}), 500

            # Report real token counts when the backend supplies them,
            # falling back to zeros otherwise.
            usage = getattr(response, "usage", None)
            return jsonify({
                "id": "chatcmpl-hf",
                "object": "chat.completion",
                "model": model,
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": content},
                    "finish_reason": "stop",
                }],
                "usage": {
                    "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
                    "completion_tokens": getattr(usage, "completion_tokens", 0) or 0,
                    "total_tokens": getattr(usage, "total_tokens", 0) or 0,
                },
            })
    except Exception as e:
        # On failure, return the error both as a top-level field and as an
        # assistant message, so simple OpenAI clients still display it.
        return jsonify({
            "error": str(e),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": f"Error: {str(e)}"},
                "finish_reason": "stop",
            }],
        }), 500
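
# A minimal client-side sketch (illustration only, never called): it streams
# a completion from the endpoint above. Assumes the server is reachable at
# http://localhost:7860 and that the `requests` package is installed; the
# function name is hypothetical.
def _example_stream_client():
    import requests

    resp = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={
            "model": DEFAULT_MODEL,
            "messages": [{"role": "user", "content": "Hello"}],
            "stream": True,
        },
        stream=True,
    )
    for line in resp.iter_lines():
        if not line:
            continue  # skip blank separator lines between SSE events
        payload = line.decode("utf-8").removeprefix("data: ")
        if payload == "[DONE]":
            break
        event = json.loads(payload)
        for choice in event.get("choices", []):
            # The final chunk carries an empty delta, so default to "".
            print(choice["delta"].get("content", ""), end="", flush=True)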

if __name__ == "__main__":
    # 7860 is the port a Hugging Face Space expects the app to listen on.
    app.run(host="0.0.0.0", port=7860, debug=False)