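# InfinityLLM API: a small Flask app that exposes OpenAI-style /v1/models and
# /v1/chat/completions endpoints and forwards requests to Hugging Face models
# via huggingface_hub.InferenceClient. Configured through the environment:
# HF_TOKEN (Inference API token, e.g. a Space secret) and DEFAULT_MODEL.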
import os
import json
from flask import Flask, request, jsonify, Response, stream_with_context
from huggingface_hub import InferenceClient
app = Flask(__name__)
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "Qwen/Qwen2.5-72B-Instruct")
client = InferenceClient(token=HF_TOKEN)
@app.route("/", methods=["GET"])
def index():
    return jsonify({"status": "ok", "message": "InfinityLLM API is running"})
@app.route("/v1/models", methods=["GET"])
def models():
    return jsonify({
        "object": "list",
        "data": [
            {"id": "Qwen/Qwen2.5-72B-Instruct", "object": "model"},
            {"id": "Qwen/Qwen2.5-7B-Instruct", "object": "model"},
            {"id": "Qwen/Qwen2.5-Coder-32B-Instruct", "object": "model"},
            {"id": "meta-llama/Llama-3.3-70B-Instruct", "object": "model"},
            {"id": "meta-llama/Llama-3.1-8B-Instruct", "object": "model"},
            {"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "object": "model"},
            {"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "object": "model"},
            {"id": "moonshotai/Kimi-K2.6", "object": "model"},
            {"id": "MiniMaxAI/MiniMax-M2.7", "object": "model"},
            {"id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", "object": "model"},
        ]
    })
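# NOTE: the list above is only what this endpoint advertises; whether a given
# model actually responds depends on the HF token and on which providers are
# currently serving it on the Inference API.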
@app.route("/v1/chat/completions", methods=["POST"])
def chat():
    # get_json(silent=True) returns None for a missing or invalid JSON body
    # instead of raising, so the explicit 400 below actually fires.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No JSON body"}), 400

    messages = data.get("messages", [])
    model = data.get("model", DEFAULT_MODEL)
    max_tokens = int(data.get("max_tokens", 2048))
    temperature = float(data.get("temperature", 0.7))
    stream = data.get("stream", False)
    try:
        if stream:
            # Streaming path: emit OpenAI-style "chat.completion.chunk" objects
            # as server-sent events, then a final chunk with finish_reason and
            # the "[DONE]" sentinel.
            def generate():
                full_content = ""
                try:
                    for chunk in client.chat_completion(
                        model=model,
                        messages=messages,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        stream=True,
                    ):
                        delta_content = ""
                        if chunk.choices and chunk.choices[0].delta:
                            delta_content = chunk.choices[0].delta.content or ""
                        full_content += delta_content
                        chunk_data = {
                            "id": "chatcmpl-hf",
                            "object": "chat.completion.chunk",
                            "model": model,
                            "choices": [{
                                "index": 0,
                                "delta": {"role": "assistant", "content": delta_content},
                                "finish_reason": None
                            }]
                        }
                        yield f"data: {json.dumps(chunk_data)}\n\n"
                    final = {
                        "id": "chatcmpl-hf",
                        "object": "chat.completion.chunk",
                        "model": model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop"
                        }]
                    }
                    yield f"data: {json.dumps(final)}\n\n"
                    yield "data: [DONE]\n\n"
                except Exception as e:
                    yield f"data: {json.dumps({'error': str(e)})}\n\n"

            return Response(
                stream_with_context(generate()),
                mimetype="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    # Disable proxy buffering so chunks reach the client immediately.
                    "X-Accel-Buffering": "no"
                }
            )
        else:
            # Non-streaming path: one blocking call, wrapped in an
            # OpenAI-style "chat.completion" response.
            response = client.chat_completion(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=False,
            )
            content = ""
            if response.choices and len(response.choices) > 0:
                choice = response.choices[0]
                if hasattr(choice, "message") and choice.message:
                    content = choice.message.content or ""
            if not content:
                return jsonify({"error": "Empty response from model"}), 500
            return jsonify({
                "id": "chatcmpl-hf",
                "object": "chat.completion",
                "model": model,
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": content
                    },
                    "finish_reason": "stop"
                }],
                # Token accounting is not reported by this proxy.
                "usage": {
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0
                }
            })
    except Exception as e:
        return jsonify({
            "error": str(e),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": f"Error: {str(e)}"},
                "finish_reason": "stop"
            }]
        }), 500
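# Hugging Face Spaces route external traffic to port 7860 by default, hence the
# hard-coded host/port below.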
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=False)
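# ---------------------------------------------------------------------------
# Example request (a minimal sketch, not part of the app): assumes the server
# is reachable at http://localhost:7860 and that the `requests` package is
# installed in the calling environment.
#
#   import requests
#
#   r = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "model": "Qwen/Qwen2.5-7B-Instruct",
#           "messages": [{"role": "user", "content": "Say hello."}],
#       },
#   )
#   print(r.json()["choices"][0]["message"]["content"])
# ---------------------------------------------------------------------------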