Infinityecs1 committed on
Commit 78904ef · verified · 1 Parent(s): 8e703d5

Upload 4 files

Files changed (4):
  1. Dockerfile +15 -0
  2. README.md +5 -4
  3. app.py +139 -0
  4. requirements.txt +2 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.11-slim
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY --chown=user . /app
+
+ EXPOSE 7860
+ CMD ["python", "app.py"]
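
Once this image is built and running (e.g. `docker build -t infinityllm .` then `docker run -p 7860:7860 -e HF_TOKEN=... infinityllm` — the `infinityllm` tag is illustrative, not part of the commit), the `/` route in app.py doubles as a health check. A minimal stdlib-only probe, assuming the container is listening on localhost:7860:

```python
import json
import urllib.request

# Assumes the container built from the Dockerfile above is up on localhost:7860.
with urllib.request.urlopen("http://localhost:7860/") as resp:
    print(json.load(resp))
# expected: {'status': 'ok', 'message': 'InfinityLLM API is running'}
```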
README.md CHANGED
@@ -1,11 +1,12 @@
  ---
- title: Infinityllm
- emoji: 🏢
- colorFrom: gray
- colorTo: pink
+ title: Infinity
+ emoji: 👁
+ colorFrom: green
+ colorTo: yellow
  sdk: docker
  pinned: false
  license: mit
+ short_description: infinityLLM
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,139 @@
+ import os
+ import json
+ from flask import Flask, request, jsonify, Response, stream_with_context
+ from huggingface_hub import InferenceClient
+
+ app = Flask(__name__)
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
+ DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "Qwen/Qwen2.5-72B-Instruct")
+ client = InferenceClient(token=HF_TOKEN)  # Hugging Face Inference API client
+
+ @app.route("/", methods=["GET"])
+ def index():
+     return jsonify({"status": "ok", "message": "InfinityLLM API is running"})
+
+ @app.route("/v1/models", methods=["GET"])
+ def models():
+     # Static model list in the OpenAI /v1/models response shape.
+     return jsonify({
+         "object": "list",
+         "data": [
+             {"id": "Qwen/Qwen2.5-72B-Instruct", "object": "model"},
+             {"id": "Qwen/Qwen2.5-7B-Instruct", "object": "model"},
+             {"id": "meta-llama/Meta-Llama-3.1-8B-Instruct", "object": "model"},
+             {"id": "mistralai/Mistral-7B-Instruct-v0.3", "object": "model"},
+         ]
+     })
+
+ @app.route("/v1/chat/completions", methods=["POST"])
+ def chat():
+     data = request.get_json(silent=True)  # None (not an exception) on a missing/non-JSON body
+     if not data:
+         return jsonify({"error": "No JSON body"}), 400
+
+     messages = data.get("messages", [])
+     model = data.get("model", DEFAULT_MODEL)
+     max_tokens = int(data.get("max_tokens", 2048))
+     temperature = float(data.get("temperature", 0.7))
+     stream = data.get("stream", False)
+
+     try:
+         if stream:
+             def generate():
+                 full_content = ""
+                 try:
+                     for chunk in client.chat_completion(
+                         model=model,
+                         messages=messages,
+                         max_tokens=max_tokens,
+                         temperature=temperature,
+                         stream=True,
+                     ):
+                         delta_content = ""
+                         if chunk.choices and chunk.choices[0].delta:
+                             delta_content = chunk.choices[0].delta.content or ""
+                         full_content += delta_content
+                         chunk_data = {
+                             "id": "chatcmpl-hf",
+                             "object": "chat.completion.chunk",
+                             "model": model,
+                             "choices": [{
+                                 "index": 0,
+                                 "delta": {"role": "assistant", "content": delta_content},
+                                 "finish_reason": None
+                             }]
+                         }
+                         yield f"data: {json.dumps(chunk_data)}\n\n"
+
+                     final = {
+                         "id": "chatcmpl-hf",
+                         "object": "chat.completion.chunk",
+                         "model": model,
+                         "choices": [{
+                             "index": 0,
+                             "delta": {},
+                             "finish_reason": "stop"
+                         }]
+                     }
+                     yield f"data: {json.dumps(final)}\n\n"
+                     yield "data: [DONE]\n\n"  # OpenAI-style stream terminator
+                 except Exception as e:
+                     yield f"data: {json.dumps({'error': str(e)})}\n\n"
+
+             return Response(
+                 stream_with_context(generate()),
+                 mimetype="text/event-stream",
+                 headers={
+                     "Cache-Control": "no-cache",
+                     "X-Accel-Buffering": "no"  # keep proxies from buffering the event stream
+                 }
+             )
+
+         else:
+             response = client.chat_completion(
+                 model=model,
+                 messages=messages,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 stream=False,
+             )
+
+             content = ""
+             if response.choices and len(response.choices) > 0:
+                 choice = response.choices[0]
+                 if hasattr(choice, "message") and choice.message:
+                     content = choice.message.content or ""
+
+             if not content:
+                 return jsonify({"error": "Empty response from model"}), 500
+
+             return jsonify({
+                 "id": "chatcmpl-hf",
+                 "object": "chat.completion",
+                 "model": model,
+                 "choices": [{
+                     "index": 0,
+                     "message": {
+                         "role": "assistant",
+                         "content": content
+                     },
+                     "finish_reason": "stop"
+                 }],
+                 "usage": {  # token counts are not tracked; zeros keep the OpenAI shape
+                     "prompt_tokens": 0,
+                     "completion_tokens": 0,
+                     "total_tokens": 0
+                 }
+             })
+
+     except Exception as e:
+         return jsonify({
+             "error": str(e),
+             "choices": [{
+                 "index": 0,
+                 "message": {"role": "assistant", "content": f"Error: {str(e)}"},
+                 "finish_reason": "stop"
+             }]
+         }), 500
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860, debug=False)
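
Since the routes mirror the OpenAI chat-completions wire format, any plain HTTP client can exercise them. A sketch using only the standard library, assuming the server above is reachable on localhost:7860 (the URL, prompts, and the `post_json` helper are illustrative, not part of the commit):

```python
import json
import urllib.request

BASE = "http://localhost:7860"  # assumption: the Flask app above, running locally

def post_json(path, payload):
    # Content-Type must be application/json for request.get_json() to parse the body.
    req = urllib.request.Request(
        BASE + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    return urllib.request.urlopen(req)

# Non-streaming: one JSON document carrying the whole reply.
with post_json("/v1/chat/completions", {
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
}) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])

# Streaming: server-sent events, one "data: {...}" line per delta,
# terminated by "data: [DONE]".
with post_json("/v1/chat/completions", {
    "messages": [{"role": "user", "content": "Count to five."}],
    "stream": True,
}) as resp:
    for raw_line in resp:
        line = raw_line.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        for choice in json.loads(payload).get("choices", []):
            print(choice["delta"].get("content") or "", end="", flush=True)
    print()
```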
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ flask
+ huggingface_hub