# infinityllm/app.py
import os
import json

from flask import Flask, request, jsonify, Response, stream_with_context
from huggingface_hub import InferenceClient

app = Flask(__name__)

# Configuration comes from the environment. HF_TOKEN authenticates calls to
# the Hugging Face Inference API; DEFAULT_MODEL is used when a request does
# not name a model.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "Qwen/Qwen2.5-72B-Instruct")

client = InferenceClient(token=HF_TOKEN)
@app.route("/", methods=["GET"])
def index():
return jsonify({"status": "ok", "message": "InfinityLLM API is running"})
@app.route("/v1/models", methods=["GET"])
def models():
return jsonify({
"object": "list",
"data": [
{"id": "Qwen/Qwen2.5-72B-Instruct", "object": "model"},
{"id": "Qwen/Qwen2.5-7B-Instruct", "object": "model"},
{"id": "Qwen/Qwen2.5-Coder-32B-Instruct", "object": "model"},
{"id": "meta-llama/Llama-3.3-70B-Instruct", "object": "model"},
{"id": "meta-llama/Llama-3.1-8B-Instruct", "object": "model"},
{"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "object": "model"},
{"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B","object": "model"},
{"id": "moonshotai/Kimi-K2.6", "object": "model"},
{"id": "MiniMaxAI/MiniMax-M2.7", "object": "model"},
{"id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", "object": "model"}, ]
})
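
# Example call (assumption: the app is reachable locally on port 7860, the
# default port a Hugging Face Space exposes):
#
#   curl http://localhost:7860/v1/models
#
# which returns the static list above as {"object": "list", "data": [...]}.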
@app.route("/v1/chat/completions", methods=["POST"])
def chat():
data = request.json
if not data:
return jsonify({"error": "No JSON body"}), 400
messages = data.get("messages", [])
model = data.get("model", DEFAULT_MODEL)
max_tokens = int(data.get("max_tokens", 2048))
temperature = float(data.get("temperature", 0.7))
stream = data.get("stream", False)

    try:
        if stream:
            def generate():
                # Relay the upstream token stream as OpenAI-style
                # server-sent events (SSE).
                try:
                    for chunk in client.chat_completion(
                        model=model,
                        messages=messages,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        stream=True,
                    ):
                        delta_content = ""
                        if chunk.choices and chunk.choices[0].delta:
                            delta_content = chunk.choices[0].delta.content or ""
                        chunk_data = {
                            "id": "chatcmpl-hf",
                            "object": "chat.completion.chunk",
                            "model": model,
                            "choices": [{
                                "index": 0,
                                "delta": {"role": "assistant", "content": delta_content},
                                "finish_reason": None,
                            }],
                        }
                        yield f"data: {json.dumps(chunk_data)}\n\n"
                    # Terminal chunk with finish_reason, then the [DONE]
                    # sentinel OpenAI clients expect.
                    final = {
                        "id": "chatcmpl-hf",
                        "object": "chat.completion.chunk",
                        "model": model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop",
                        }],
                    }
                    yield f"data: {json.dumps(final)}\n\n"
                    yield "data: [DONE]\n\n"
                except Exception as e:
                    # Surface upstream failures inside the stream itself.
                    yield f"data: {json.dumps({'error': str(e)})}\n\n"

            return Response(
                stream_with_context(generate()),
                mimetype="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "X-Accel-Buffering": "no",  # keep reverse proxies from buffering the stream
                },
            )
        else:
            response = client.chat_completion(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=False,
            )
            content = ""
            if response.choices:
                choice = response.choices[0]
                if getattr(choice, "message", None):
                    content = choice.message.content or ""
            if not content:
                return jsonify({"error": "Empty response from model"}), 500

            # Report real token counts when the backend supplies them,
            # falling back to zeros otherwise.
            usage = getattr(response, "usage", None)
            return jsonify({
                "id": "chatcmpl-hf",
                "object": "chat.completion",
                "model": model,
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": content},
                    "finish_reason": "stop",
                }],
                "usage": {
                    "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
                    "completion_tokens": getattr(usage, "completion_tokens", 0) or 0,
                    "total_tokens": getattr(usage, "total_tokens", 0) or 0,
                },
            })
    except Exception as e:
        # On failure, return the error both as a top-level field and as an
        # assistant message, so simple OpenAI clients still display it.
        return jsonify({
            "error": str(e),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": f"Error: {str(e)}"},
                "finish_reason": "stop",
            }],
        }), 500
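
# A minimal client-side sketch (illustration only, never called): it streams
# a completion from the endpoint above. Assumes the server is reachable at
# http://localhost:7860 and that the `requests` package is installed; the
# function name is hypothetical.
def _example_stream_client():
    import requests

    resp = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={
            "model": DEFAULT_MODEL,
            "messages": [{"role": "user", "content": "Hello"}],
            "stream": True,
        },
        stream=True,
    )
    for line in resp.iter_lines():
        if not line:
            continue  # skip blank separator lines between SSE events
        payload = line.decode("utf-8").removeprefix("data: ")
        if payload == "[DONE]":
            break
        event = json.loads(payload)
        for choice in event.get("choices", []):
            # The final chunk carries an empty delta, so default to "".
            print(choice["delta"].get("content", ""), end="", flush=True)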

if __name__ == "__main__":
    # 7860 is the port a Hugging Face Space expects the app to listen on.
    app.run(host="0.0.0.0", port=7860, debug=False)