# buddy-desktop / webapp.py
# carbonx's picture
# Add webapp.py
# 2ba74a1 verified
"""
Buddy Web — Gradio-basert multimodal assistent
Enkel aa demonstrere uten desktop-avhengigheter.
Krever bare HuggingFace token for inference.
python webapp.py
Apner paa http://localhost:7860
"""
import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from huggingface_hub import InferenceClient
from PIL import Image
_ASR_MODEL = "openai/whisper-large-v3"
_CHAT_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"


def _to_int16_pcm(samples):
    """Return *samples* as an int16 array suitable for a 16-bit PCM WAV file.

    Floating-point audio is assumed to be normalised to [-1.0, 1.0] and is
    scaled to the int16 range; a plain ``astype(np.int16)`` would truncate
    such data to (near) silence. Other dtypes are cast as before.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.floating):
        return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    return samples.astype(np.int16)


def _image_to_data_url(image):
    """Encode a PIL image as a base64 PNG ``data:`` URL."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return "data:image/png;base64," + encoded


def create_app():
    """Build and return the Gradio Blocks UI for the Buddy assistant.

    The UI takes an optional screenshot and an optional microphone
    recording. On submit, the recording is transcribed with the speech
    recognition model, and the transcript (or a default "describe the
    screenshot" prompt) is sent together with the screenshot to the
    vision-language chat model. The model's reply is shown in a textbox.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    client = InferenceClient()

    def process_turn(audio, screenshot):
        """Handle one submit click and return the assistant's reply text.

        Args:
            audio: ``(sample_rate, samples)`` tuple from ``gr.Audio`` with
                ``type="numpy"``, or ``None`` when nothing was recorded.
            screenshot: PIL image from ``gr.Image`` with ``type="pil"``,
                or ``None`` when nothing was uploaded.

        Returns:
            The model reply, or a user-facing error/help message.
        """
        if audio is None and screenshot is None:
            return "Vennligst snakk inn noe eller last opp et skjermbilde."

        content = []
        if screenshot is not None:
            # The chat-completion API takes OpenAI-style content parts; a raw
            # PIL object is not JSON-serializable, so send it as a data URL.
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": _image_to_data_url(screenshot)},
                }
            )

        if audio is not None:
            sample_rate, samples = audio
            # mkstemp + close: the path must not be held open while scipy
            # reopens it (that fails on Windows), and the file must exist
            # before the try so the finally can always remove it.
            fd, wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                wavfile.write(wav_path, sample_rate, _to_int16_pcm(samples))
                transcript = client.automatic_speech_recognition(
                    wav_path,
                    model=_ASR_MODEL,
                )
                text = transcript.text
            except Exception as e:
                # Best effort: the failure text is forwarded as the prompt so
                # the turn still produces a response.
                text = "[Kunne ikke transkribere: " + str(e) + "]"
            finally:
                os.unlink(wav_path)
        else:
            # Screenshot only: fall back to a default prompt.
            text = "Beskriv det du ser paa skjermbildet."
        content.append({"type": "text", "text": text})

        messages = [{"role": "user", "content": content}]
        try:
            response = client.chat_completion(
                model=_CHAT_MODEL,
                messages=messages,
                max_tokens=512,
            )
            # The content field may be None; keep the textbox value a string.
            reply = response.choices[0].message.content or ""
        except Exception as e:
            reply = "Feil: " + str(e)
        return reply

    with gr.Blocks(
        title="Buddy — Push-to-Talk AI Assistant",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown("# Buddy AI Assistent")
        gr.Markdown("Snakk inn eller skriv. Assistenten ser skjermbildet ditt.")
        with gr.Row():
            with gr.Column():
                screenshot_input = gr.Image(
                    label="Skjermbilde / Dokument",
                    type="pil",
                    sources=["upload", "clipboard"],
                )
                audio_input = gr.Audio(
                    label="Push-to-Talk (spill inn stemme)",
                    sources=["microphone"],
                    type="numpy",
                )
                submit_btn = gr.Button("Send til Buddy", variant="primary")
            with gr.Column():
                output = gr.Textbox(
                    label="Buddy svarer",
                    lines=10,
                    interactive=False,
                )
                clear_btn = gr.Button("Tom")

        submit_btn.click(
            fn=process_turn,
            inputs=[audio_input, screenshot_input],
            outputs=output,
        )
        # Reset both inputs and the reply box in one click.
        clear_btn.click(
            lambda: (None, None, ""),
            outputs=[audio_input, screenshot_input, output],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on port 7860 without a
    # public share link. The "0.0.0.0" host is passed through to Gradio as-is.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = create_app()
    demo.launch(**launch_options)