Spaces:
Sleeping
Sleeping
File size: 3,207 Bytes
"""
Buddy Web — Gradio-based multimodal assistant (HF Space version)
"""
import gradio as gr
import numpy as np
from PIL import Image
import tempfile
import os
import scipy.io.wavfile as wavfile
from huggingface_hub import InferenceClient
def _image_to_data_url(image):
    """Encode a PIL image as a base64 PNG ``data:`` URL.

    The chat-completion request body is serialised as JSON, so the image
    cannot be passed as a PIL object; it is embedded as a data URL instead.

    Args:
        image: A ``PIL.Image.Image`` (what ``gr.Image(type="pil")`` yields).

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    import base64
    import io

    # PNG cannot store every PIL mode (e.g. CMYK); fall back to RGB for those.
    if image.mode not in ("1", "L", "LA", "P", "RGB", "RGBA"):
        image = image.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return f"data:image/png;base64,{encoded}"


def _to_int16_pcm(samples):
    """Return *samples* as an int16 array ready to be written as 16-bit WAV.

    Floating-point audio normalised to [-1.0, 1.0] is scaled to the int16
    range first; a plain cast would truncate such samples to (near) silence.
    Every other dtype is cast directly, as the original code did.

    Args:
        samples: Array-like audio samples.

    Returns:
        A ``numpy.ndarray`` with dtype ``int16``.
    """
    samples = np.asarray(samples)
    if samples.dtype == np.int16:
        return samples
    if np.issubdtype(samples.dtype, np.floating):
        peak = np.abs(samples).max() if samples.size else 0.0
        # Only rescale when the signal is actually normalised; floats that
        # already hold int16-scale values are cast unchanged.
        if peak <= 1.0:
            return (samples * 32767).astype(np.int16)
    return samples.astype(np.int16)


def _transcribe(client, audio):
    """Transcribe a Gradio ``(sample_rate, samples)`` tuple to text.

    Args:
        client: The ``InferenceClient`` used for speech recognition.
        audio: ``(sample_rate, samples)`` as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        The transcript, or a bracketed error message if writing the WAV file
        or the recognition request fails.
    """
    sample_rate, samples = audio
    # Reserve a path and close the handle before writing, so the file is not
    # opened twice at once (which fails on some platforms).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name
    try:
        wavfile.write(wav_path, sample_rate, _to_int16_pcm(samples))
        result = client.automatic_speech_recognition(
            wav_path,
            model="openai/whisper-large-v3",
        )
        return result.text
    except Exception as e:
        # Best effort: the error text is passed on as the user prompt.
        return "[Kunne ikke transkribere: " + str(e) + "]"
    finally:
        # Always remove the temp file, including when writing it failed.
        os.unlink(wav_path)


def create_app():
    """Build the Buddy Gradio app.

    The UI takes an optional screenshot and an optional microphone recording,
    transcribes the recording, sends text plus image to a vision-language
    chat model and shows the reply in a read-only textbox.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    # Uses HF_TOKEN from the environment (per the original author it is set
    # automatically in HF Spaces); the token is None when the variable is unset.
    client = InferenceClient(token=os.environ.get("HF_TOKEN"))

    def process_turn(audio, screenshot):
        """Handle one turn.

        Args:
            audio: ``(sample_rate, samples)`` from the microphone, or None.
            screenshot: PIL image from the image input, or None.

        Returns:
            The model reply, a hint when both inputs are missing, or an
            error string prefixed with ``"Feil: "`` if the chat request fails.
        """
        if audio is None and screenshot is None:
            return "Vennligst snakk inn noe eller last opp et skjermbilde."

        content = []
        if screenshot is not None:
            # chat_completion takes OpenAI-style "image_url" chunks.
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": _image_to_data_url(screenshot)},
                }
            )

        if audio is not None:
            text = _transcribe(client, audio)
        else:
            # Screenshot only: ask for a description of the image.
            text = "Beskriv det du ser paa skjermbildet."
        content.append({"type": "text", "text": text})

        try:
            response = client.chat_completion(
                model="Qwen/Qwen2.5-VL-7B-Instruct",
                messages=[{"role": "user", "content": content}],
                max_tokens=512,
            )
            return response.choices[0].message.content
        except Exception as e:
            # Show the failure in the output box rather than raising.
            return "Feil: " + str(e)

    with gr.Blocks(title="Buddy — Push-to-Talk AI Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Buddy AI Assistent")
        gr.Markdown("Snakk inn eller skriv. Assistenten ser skjermbildet ditt.")
        with gr.Row():
            with gr.Column():
                screenshot_input = gr.Image(
                    label="Skjermbilde / Dokument",
                    type="pil",
                    sources=["upload", "clipboard"],
                )
                audio_input = gr.Audio(
                    label="Push-to-Talk (spill inn stemme)",
                    sources=["microphone"],
                    type="numpy",
                )
                submit_btn = gr.Button("Send til Buddy", variant="primary")
            with gr.Column():
                output = gr.Textbox(
                    label="Buddy svarer",
                    lines=10,
                    interactive=False,
                )
                clear_btn = gr.Button("Tom")
        submit_btn.click(
            fn=process_turn,
            inputs=[audio_input, screenshot_input],
            outputs=output,
        )
        # Reset both inputs and the reply box.
        clear_btn.click(lambda: (None, None, ""), outputs=[audio_input, screenshot_input, output])
    return demo
if __name__ == "__main__":
    # Script entry point: build the app and start the Gradio server.
    # The module-level name `demo` is kept as in the original.
    demo = create_app()
    demo.launch()
|