Spaces:

SupraLabs
/

Supra-50M-Instruct

Running

App Files Community

LH-Tech-AI committed 1 day ago

Commit

b896b6f

verified ·

1 Parent(s): 4776b08

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -32

app.py CHANGED Viewed

@@ -60,9 +60,12 @@ def build_prompt(history: list, system: str, new_message: str) -> str:
     return "".join(parts)
-# ── Generate ──────────────────────────────────────────────────────────────────
-def chat(
     message: str,
     history: list,
     system_prompt: str,
@@ -70,28 +73,35 @@ def chat(
     temperature: float,
     top_p: float,
     repetition_penalty: float,
-) -> str:
     if not message.strip():
-        return ""
     prompt = build_prompt(history, system_prompt, message)
     inputs = tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=int(max_new_tokens),
-            do_sample=temperature > 0.01,
-            temperature=float(temperature),
-            top_p=float(top_p),
-            top_k=50,
-            repetition_penalty=float(repetition_penalty),
-            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
-    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
 # ── UI ────────────────────────────────────────────────────────────────────────
@@ -99,7 +109,7 @@ def chat(
 with gr.Blocks(title="Supra-50M Instruct") as demo:
     gr.Markdown(
         "# 🦅 Supra-50M Instruct\n"
-        "50M-parameter chat model by [SupraLabs](https://huggingface.co/SupraLabs), running on CPU (Consider local inference, the quality is much better)"
     )
     with gr.Row():
@@ -110,8 +120,11 @@ with gr.Blocks(title="Supra-50M Instruct") as demo:
                 show_label=False,
                 lines=1,
                 max_lines=4,
-                submit_btn=True,
             )
         with gr.Column(scale=1, min_width=220):
             gr.Markdown("### ⚙️ Parameters")
@@ -132,28 +145,57 @@ with gr.Blocks(title="Supra-50M Instruct") as demo:
             repetition_penalty = gr.Slider(
                 label="Repetition penalty", minimum=1.0, maximum=1.5, value=1.15, step=0.05
             )
-            clear_btn = gr.Button("🗑️ Clear chat", variant="secondary")
     # ── State & wiring ────────────────────────────────────────────────────────
     chat_state = gr.State([])
-    def on_submit(message, history, system, max_tok, temp, top_p_val, rep_pen):
         if not message.strip():
-            return history, history, ""
-        response = chat(message, history, system, max_tok, temp, top_p_val, rep_pen)
-        history = history + [
-            {"role": "user", "content": message},
-            {"role": "assistant", "content": response},
-        ]
-        return history, history, ""
-    msg_box.submit(
-        fn=on_submit,
-        inputs=[msg_box, chat_state, system_prompt, max_new_tokens, temperature, top_p, repetition_penalty],
         outputs=[chatbot, chat_state, msg_box],
     )
     clear_btn.click(

     return "".join(parts)
+# ── Generate ────────────────────────────────────────────
+from transformers import TextIteratorStreamer
+from threading import Thread
+def chat_stream(
     message: str,
     history: list,
     system_prompt: str,
     temperature: float,
     top_p: float,
     repetition_penalty: float,
+):
     if not message.strip():
+        return
     prompt = build_prompt(history, system_prompt, message)
     inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=int(max_new_tokens),
+        do_sample=temperature > 0.01,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        top_k=50,
+        repetition_penalty=float(repetition_penalty),
+        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        yield partial_text
 # ── UI ────────────────────────────────────────────────────────────────────────
 with gr.Blocks(title="Supra-50M Instruct") as demo:
     gr.Markdown(
         "# 🦅 Supra-50M Instruct\n"
+        "50M-parameter chat model by [SupraLabs](https://huggingface.co/SupraLabs), running on CPU."
     )
     with gr.Row():
                 show_label=False,
                 lines=1,
                 max_lines=4,
             )
+            with gr.Row():
+                submit_btn = gr.Button("🚀 Send", variant="primary")
+                stop_btn = gr.Button("🛑 Stop", variant="stop")
+                clear_btn = gr.Button("🗑️ Clear chat", variant="secondary")
         with gr.Column(scale=1, min_width=220):
             gr.Markdown("### ⚙️ Parameters")
             repetition_penalty = gr.Slider(
                 label="Repetition penalty", minimum=1.0, maximum=1.5, value=1.15, step=0.05
             )
     # ── State & wiring ────────────────────────────────────────────────────────
     chat_state = gr.State([])
+    def user_step(message, history):
+        """Record the submitted message as a new user turn.
+
+        Returns a 3-tuple matching the outputs this step is wired to
+        (chatbot, chat_state, msg_box); the last element is always "" so the
+        message box is emptied.
+        """
         if not message.strip():
+            # Blank / whitespace-only input: pass gr.update() with no new value for
+            # the chatbot and hand the history back unchanged.
+            return gr.update(), history, ""
+        # Concatenate into a new list instead of appending to the one passed in.
+        new_history = history + [{"role": "user", "content": message}]
+        return new_history, new_history, ""
+    def bot_step(history, system, max_tok, temp, top_p_val, rep_pen):
+        """Stream the assistant's reply to the newest user turn in ``history``.
+
+        Generator wired to the (chatbot, chat_state) outputs: every yield is the
+        history with a trailing assistant message holding the text produced so
+        far. Yields nothing when there is no pending user turn.
+        """
+        # This step is chained after user_step with .then(), so it also runs when
+        # user_step ignored a blank message. In that case the last entry is the
+        # previous assistant reply; without the role check it would be fed back
+        # to the model as if the user had typed it.
+        if not history or history[-1].get("role") != "user":
+            # Bare return: this is a generator, so a returned value would never
+            # be delivered as an output update anyway.
+            return
+        user_message = history[-1]["content"]
+        # chat_stream takes the new message separately from the earlier turns.
+        history_before = history[:-1]
+        # Concatenate into a new list so the list passed in is not mutated.
+        history = history + [{"role": "assistant", "content": ""}]
+        for response_chunk in chat_stream(user_message, history_before, system, max_tok, temp, top_p_val, rep_pen):
+            # chat_stream yields the accumulated text, so overwrite, don't append.
+            history[-1]["content"] = response_chunk
+            yield history, history
+    # Submitting the message box: first record the user turn (user_step, with
+    # queue=False), then stream the reply (bot_step) into the chatbot and state.
+    submit_event = msg_box.submit(
+        fn=user_step,
+        inputs=[msg_box, chat_state],
         outputs=[chatbot, chat_state, msg_box],
+        queue=False
+    ).then(
+        fn=bot_step,
+        inputs=[chat_state, system_prompt, max_new_tokens, temperature, top_p, repetition_penalty],
+        outputs=[chatbot, chat_state]
+    )
+    # Send button: same two-step chain as submitting the message box.
+    click_event = submit_btn.click(
+        fn=user_step,
+        inputs=[msg_box, chat_state],
+        outputs=[chatbot, chat_state, msg_box],
+        queue=False
+    ).then(
+        fn=bot_step,
+        inputs=[chat_state, system_prompt, max_new_tokens, temperature, top_p, repetition_penalty],
+        outputs=[chatbot, chat_state]
+    )
+    # Stop button: runs no function of its own, it only cancels the two chains above.
+    # NOTE(review): chat_stream starts model.generate in a separate Thread with no
+    # visible stop signal, so generation presumably keeps running in the background
+    # after a cancel — confirm.
+    stop_btn.click(
+        fn=None,
+        inputs=None,
+        outputs=None,
+        cancels=[submit_event, click_event]
     )
     clear_btn.click(