Spaces:
Running
import os
import json
from flask import Flask, request, jsonify, Response, stream_with_context
from huggingface_hub import InferenceClient

app = Flask(__name__)

# Hugging Face Inference API token; empty string means unauthenticated calls.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Model used when a request does not name one explicitly.
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "Qwen/Qwen2.5-72B-Instruct")
# One shared client for all requests; the target model is chosen per call.
client = InferenceClient(token=HF_TOKEN)
@app.route("/")
def index():
    """Health-check endpoint: report that the API is up.

    Returns a JSON object with ``status`` and a human-readable message.
    """
    # NOTE(review): the original file registered no routes at all, so Flask
    # served only 404s; the route decorator restores the endpoint.
    return jsonify({"status": "ok", "message": "InfinityLLM API is running"})
@app.route("/models")
@app.route("/v1/models")
def models():
    """List the models this proxy exposes, in OpenAI ``/v1/models`` format.

    The catalogue is static; entries mirror model ids accepted by the
    Hugging Face Inference API.
    """
    return jsonify({
        "object": "list",
        "data": [
            {"id": "Qwen/Qwen2.5-72B-Instruct", "object": "model"},
            {"id": "Qwen/Qwen2.5-7B-Instruct", "object": "model"},
            {"id": "Qwen/Qwen2.5-Coder-32B-Instruct", "object": "model"},
            {"id": "meta-llama/Llama-3.3-70B-Instruct", "object": "model"},
            {"id": "meta-llama/Llama-3.1-8B-Instruct", "object": "model"},
            {"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "object": "model"},
            {"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "object": "model"},
            {"id": "moonshotai/Kimi-K2.6", "object": "model"},
            {"id": "MiniMaxAI/MiniMax-M2.7", "object": "model"},
            {"id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", "object": "model"},
        ],
    })
@app.route("/v1/chat/completions", methods=["POST"])
def chat():
    """OpenAI-compatible chat-completion endpoint.

    Expects a JSON body with ``messages`` and optional ``model``,
    ``max_tokens``, ``temperature`` and ``stream`` fields. Proxies the call
    to the Hugging Face Inference API via ``client.chat_completion``.

    Returns either a single OpenAI-style ``chat.completion`` JSON object or,
    when ``stream`` is truthy, a server-sent-events stream of
    ``chat.completion.chunk`` objects terminated by ``data: [DONE]``.
    """
    # get_json(silent=True) returns None instead of erroring when the body
    # is missing or has the wrong content type, so the 400 path below
    # handles every malformed request uniformly.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No JSON body"}), 400

    messages = data.get("messages", [])
    model = data.get("model", DEFAULT_MODEL)
    max_tokens = int(data.get("max_tokens", 2048))
    temperature = float(data.get("temperature", 0.7))
    stream = data.get("stream", False)

    try:
        if stream:
            def generate():
                """Yield SSE frames translating HF stream chunks to OpenAI format."""
                full_content = ""
                try:
                    for chunk in client.chat_completion(
                        model=model,
                        messages=messages,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        stream=True,
                    ):
                        delta_content = ""
                        if chunk.choices and chunk.choices[0].delta:
                            # delta.content can be None on role-only chunks.
                            delta_content = chunk.choices[0].delta.content or ""
                        full_content += delta_content
                        chunk_data = {
                            "id": "chatcmpl-hf",
                            "object": "chat.completion.chunk",
                            "model": model,
                            "choices": [{
                                "index": 0,
                                "delta": {"role": "assistant", "content": delta_content},
                                "finish_reason": None
                            }]
                        }
                        yield f"data: {json.dumps(chunk_data)}\n\n"
                    # Final frame carries the finish_reason, then the
                    # OpenAI-style terminator sentinel.
                    final = {
                        "id": "chatcmpl-hf",
                        "object": "chat.completion.chunk",
                        "model": model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop"
                        }]
                    }
                    yield f"data: {json.dumps(final)}\n\n"
                    yield "data: [DONE]\n\n"
                except Exception as e:
                    # Mid-stream failures can no longer change the HTTP
                    # status, so report them in-band as an SSE frame.
                    yield f"data: {json.dumps({'error': str(e)})}\n\n"

            return Response(
                stream_with_context(generate()),
                mimetype="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    # Disable nginx response buffering so frames flush promptly.
                    "X-Accel-Buffering": "no"
                }
            )
        else:
            response = client.chat_completion(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=False,
            )
            content = ""
            if response.choices and len(response.choices) > 0:
                choice = response.choices[0]
                if hasattr(choice, "message") and choice.message:
                    content = choice.message.content or ""
            if not content:
                return jsonify({"error": "Empty response from model"}), 500
            return jsonify({
                "id": "chatcmpl-hf",
                "object": "chat.completion",
                "model": model,
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": content
                    },
                    "finish_reason": "stop"
                }],
                # Token accounting is not available from this client path;
                # zeros keep the OpenAI response shape intact.
                "usage": {
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0
                }
            })
    except Exception as e:
        # Surface upstream failures both as an error field and as an
        # assistant message, so lenient OpenAI clients still render something.
        return jsonify({
            "error": str(e),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": f"Error: {str(e)}"},
                "finish_reason": "stop"
            }]
        }), 500
if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
    app.run(host="0.0.0.0", port=7860, debug=False)