import json
import os

from flask import Flask, Response, jsonify, request, stream_with_context
from huggingface_hub import InferenceClient

app = Flask(__name__)

HF_TOKEN = os.environ.get("HF_TOKEN", "")
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "Qwen/Qwen2.5-72B-Instruct")

# Pass None rather than an empty string so huggingface_hub does not send an
# invalid Authorization header when no token is configured.
client = InferenceClient(token=HF_TOKEN or None)


@app.route("/", methods=["GET"])
def index():
    """Health-check endpoint."""
    return jsonify({"status": "ok", "message": "InfinityLLM API is running"})


@app.route("/v1/models", methods=["GET"])
def models():
    """OpenAI-compatible model listing."""
    return jsonify({
        "object": "list",
        "data": [
            {"id": "Qwen/Qwen2.5-72B-Instruct", "object": "model"},
            {"id": "Qwen/Qwen2.5-7B-Instruct", "object": "model"},
            {"id": "Qwen/Qwen2.5-Coder-32B-Instruct", "object": "model"},
            {"id": "meta-llama/Llama-3.3-70B-Instruct", "object": "model"},
            {"id": "meta-llama/Llama-3.1-8B-Instruct", "object": "model"},
            {"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "object": "model"},
            {"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "object": "model"},
            {"id": "moonshotai/Kimi-K2.6", "object": "model"},
            {"id": "MiniMaxAI/MiniMax-M2.7", "object": "model"},
            {"id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", "object": "model"},
        ],
    })


@app.route("/v1/chat/completions", methods=["POST"])
def chat():
    """OpenAI-compatible chat completions, with optional SSE streaming."""
    # silent=True returns None instead of raising on a missing or
    # non-JSON request body, so the 400 below is reached reliably.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No JSON body"}), 400

    messages = data.get("messages", [])
    model = data.get("model", DEFAULT_MODEL)
    max_tokens = int(data.get("max_tokens", 2048))
    temperature = float(data.get("temperature", 0.7))
    stream = data.get("stream", False)

    try:
        if stream:
            def generate():
                try:
                    # Relay each upstream chunk as an OpenAI-style SSE event.
                    for chunk in client.chat_completion(
                        model=model,
                        messages=messages,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        stream=True,
                    ):
                        delta_content = ""
                        if chunk.choices and chunk.choices[0].delta:
                            delta_content = chunk.choices[0].delta.content or ""
                        chunk_data = {
                            "id": "chatcmpl-hf",
                            "object": "chat.completion.chunk",
                            "model": model,
                            "choices": [{
                                "index": 0,
                                "delta": {"role": "assistant", "content": delta_content},
                                "finish_reason": None,
                            }],
                        }
                        yield f"data: {json.dumps(chunk_data)}\n\n"

                    # Terminate with a final empty-delta chunk and the
                    # OpenAI-style [DONE] sentinel.
                    final = {
                        "id": "chatcmpl-hf",
                        "object": "chat.completion.chunk",
                        "model": model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop",
                        }],
                    }
                    yield f"data: {json.dumps(final)}\n\n"
                    yield "data: [DONE]\n\n"
                except Exception as e:
                    # Headers are already sent by this point, so errors can
                    # only be reported in-band as an SSE event.
                    yield f"data: {json.dumps({'error': str(e)})}\n\n"

            return Response(
                stream_with_context(generate()),
                mimetype="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "X-Accel-Buffering": "no",  # disable proxy (nginx) buffering
                },
            )
        else:
            response = client.chat_completion(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=False,
            )
            content = ""
            if response.choices:
                choice = response.choices[0]
                if hasattr(choice, "message") and choice.message:
                    content = choice.message.content or ""
            if not content:
                return jsonify({"error": "Empty response from model"}), 500

            return jsonify({
                "id": "chatcmpl-hf",
                "object": "chat.completion",
                "model": model,
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": content},
                    "finish_reason": "stop",
                }],
                # Token usage is not tracked here; zeros keep the response
                # shape OpenAI-compatible.
                "usage": {
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0,
                },
            })
    except Exception as e:
        # Surface the error both as an "error" field and as assistant content,
        # so clients that only read choices still see what went wrong.
        return jsonify({
            "error": str(e),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": f"Error: {str(e)}"},
                "finish_reason": "stop",
            }],
        }), 500


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=False)
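
# --- Usage sketch ---
# A minimal example of exercising the endpoints above. Assumptions: the
# server is running locally on port 7860 (as configured in app.run above),
# HF_TOKEN is exported in the environment, and the model id is one of the
# ids returned by /v1/models.
#
#   curl http://localhost:7860/v1/models
#
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen/Qwen2.5-7B-Instruct",
#          "messages": [{"role": "user", "content": "Hello"}],
#          "stream": false}'
#
# With "stream": true the response is a text/event-stream of
# "data: {...}" chunks ending in "data: [DONE]", matching what OpenAI
# SDK clients expect when pointed at this server as a custom base URL.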