Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import os
-
+import json
+from flask import Flask, request, jsonify, Response, stream_with_context
 from huggingface_hub import InferenceClient
 
 app = Flask(__name__)

@@ -33,45 +34,96 @@ def chat():
     model = data.get("model", DEFAULT_MODEL)
     max_tokens = int(data.get("max_tokens", 2048))
     temperature = float(data.get("temperature", 0.7))
+    stream = data.get("stream", False)
 
     try:
-
-
-
-
-
-
-
+        if stream:
+            def generate():
+                full_content = ""
+                try:
+                    for chunk in client.chat_completion(
+                        model=model,
+                        messages=messages,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        stream=True,
+                    ):
+                        delta_content = ""
+                        if chunk.choices and chunk.choices[0].delta:
+                            delta_content = chunk.choices[0].delta.content or ""
+                        full_content += delta_content
+                        chunk_data = {
+                            "id": "chatcmpl-hf",
+                            "object": "chat.completion.chunk",
+                            "model": model,
+                            "choices": [{
+                                "index": 0,
+                                "delta": {"role": "assistant", "content": delta_content},
+                                "finish_reason": None
+                            }]
+                        }
+                        yield f"data: {json.dumps(chunk_data)}\n\n"
+
+                    final = {
+                        "id": "chatcmpl-hf",
+                        "object": "chat.completion.chunk",
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "delta": {},
+                            "finish_reason": "stop"
+                        }]
+                    }
+                    yield f"data: {json.dumps(final)}\n\n"
+                    yield "data: [DONE]\n\n"
+                except Exception as e:
+                    yield f"data: {json.dumps({'error': str(e)})}\n\n"
 
-
-
-
-
-
-
-
+            return Response(
+                stream_with_context(generate()),
+                mimetype="text/event-stream",
+                headers={
+                    "Cache-Control": "no-cache",
+                    "X-Accel-Buffering": "no"
+                }
+            )
 
-
-
+        else:
+            response = client.chat_completion(
+                model=model,
+                messages=messages,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                stream=False,
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            content = ""
+            if response.choices and len(response.choices) > 0:
+                choice = response.choices[0]
+                if hasattr(choice, "message") and choice.message:
+                    content = choice.message.content or ""
+
+            if not content:
+                return jsonify({"error": "Empty response from model"}), 500
+
+            return jsonify({
+                "id": "chatcmpl-hf",
+                "object": "chat.completion",
+                "model": model,
+                "choices": [{
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": content
+                    },
+                    "finish_reason": "stop"
+                }],
+                "usage": {
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                    "total_tokens": 0
+                }
+            })
 
     except Exception as e:
         return jsonify({
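For reference, a minimal sketch of a client consuming this endpoint once streaming is in place. The route path (/chat) and port (7860) are assumptions, since the @app.route decorator sits outside this diff; the parsing simply follows the "data: {...}" / "data: [DONE]" SSE framing that generate() above emits.

    # Hypothetical client for the endpoint patched above. The URL is an
    # assumption (the route decorator is outside this diff); adjust host,
    # port, and path to the actual Space.
    import json
    import requests

    resp = requests.post(
        "http://localhost:7860/chat",  # assumed route
        json={
            "messages": [{"role": "user", "content": "Hello!"}],
            "stream": True,
        },
        stream=True,  # keep the HTTP connection open and read incrementally
    )
    for raw in resp.iter_lines():
        if not raw:  # SSE events are separated by blank lines
            continue
        line = raw.decode("utf-8")
        if not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":  # terminator emitted by generate()
            break
        event = json.loads(payload)
        if "error" in event:  # error frame from the except branch
            raise RuntimeError(event["error"])
        delta = event["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)

The "Cache-Control: no-cache" and "X-Accel-Buffering: no" headers in the patch matter for exactly this kind of client: without them, a reverse proxy such as nginx may buffer the event stream, and the deltas would arrive in one burst instead of incrementally.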