"""Buddy Web — Gradio-based multimodal assistant (HF Space version)."""

import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from huggingface_hub import InferenceClient
from PIL import Image

# Models used for the two remote inference calls.
ASR_MODEL = "openai/whisper-large-v3"
VISION_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
MAX_REPLY_TOKENS = 512


def _image_to_data_url(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG ``data:`` URL.

    Chat messages are sent as JSON, so the image has to be embedded as text
    rather than passed as a PIL object.
    """
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return "data:image/png;base64," + encoded


def _to_int16_pcm(samples: np.ndarray) -> np.ndarray:
    """Convert audio samples to 16-bit PCM without destroying the signal.

    A plain ``astype(np.int16)`` truncates float audio to (almost) all zeros
    and wraps around for wider integer types, so those cases are rescaled.
    """
    samples = np.asarray(samples)
    if samples.dtype == np.int16:
        return samples
    if np.issubdtype(samples.dtype, np.floating):
        # Assumes float audio is normalized to [-1.0, 1.0] — TODO confirm
        # against the recorder actually used.
        return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    if samples.dtype == np.int32:
        # Keep the 16 most significant bits instead of wrapping.
        return (samples >> 16).astype(np.int16)
    return samples.astype(np.int16)


def _transcribe(client: InferenceClient, audio) -> str:
    """Write ``audio`` to a temporary WAV file and return its transcript.

    Args:
        client: Inference client used for the speech-recognition call.
        audio: ``(sample_rate, samples)`` tuple as delivered by ``gr.Audio``
            with ``type="numpy"``.

    Returns:
        The transcribed text.

    Raises:
        Exception: Whatever the WAV writer or the inference call raises; the
            temporary file is removed in every case.
    """
    sample_rate, samples = audio
    # Close the handle before writing by name: reopening a file that is still
    # held open fails on Windows.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        wavfile.write(tmp_path, sample_rate, _to_int16_pcm(samples))
        transcript = client.automatic_speech_recognition(
            tmp_path,
            model=ASR_MODEL,
        )
        return transcript.text
    finally:
        os.unlink(tmp_path)


def create_app() -> gr.Blocks:
    """Build the Buddy Gradio interface.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    # Reads the token from the HF_TOKEN environment variable. The original
    # note says HF Spaces sets it automatically — verify; it may have to be
    # configured as a Space secret.
    client = InferenceClient(token=os.environ.get("HF_TOKEN"))

    def process_turn(audio, screenshot):
        """Handle one user turn and return the assistant's reply text.

        Args:
            audio: ``(sample_rate, samples)`` tuple from the microphone, or
                ``None`` when nothing was recorded.
            screenshot: PIL image, or ``None`` when nothing was uploaded.

        Returns:
            The model's reply, or a user-facing message when there is no
            input or the chat request fails.
        """
        if audio is None and screenshot is None:
            return "Vennligst snakk inn noe eller last opp et skjermbilde."

        if audio is not None:
            try:
                text = _transcribe(client, audio)
            except Exception as e:
                # Best effort: the failure text is forwarded as the prompt so
                # the turn still produces a reply.
                text = "[Kunne ikke transkribere: " + str(e) + "]"
        else:
            text = "Beskriv det du ser paa skjermbildet."

        try:
            content = []
            if screenshot is not None:
                # OpenAI-style image part; a raw PIL object cannot be
                # serialized into the JSON request.
                content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": _image_to_data_url(screenshot)},
                    }
                )
            content.append({"type": "text", "text": text})

            response = client.chat_completion(
                model=VISION_MODEL,
                messages=[{"role": "user", "content": content}],
                max_tokens=MAX_REPLY_TOKENS,
            )
            return response.choices[0].message.content
        except Exception as e:
            return "Feil: " + str(e)

    with gr.Blocks(
        title="Buddy — Push-to-Talk AI Assistant",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown("# Buddy AI Assistent")
        gr.Markdown("Snakk inn eller skriv. Assistenten ser skjermbildet ditt.")

        with gr.Row():
            with gr.Column():
                screenshot_input = gr.Image(
                    label="Skjermbilde / Dokument",
                    type="pil",
                    sources=["upload", "clipboard"],
                )
                audio_input = gr.Audio(
                    label="Push-to-Talk (spill inn stemme)",
                    sources=["microphone"],
                    type="numpy",
                )
                submit_btn = gr.Button("Send til Buddy", variant="primary")
            with gr.Column():
                output = gr.Textbox(
                    label="Buddy svarer",
                    lines=10,
                    interactive=False,
                )
                clear_btn = gr.Button("Tom")

        submit_btn.click(
            fn=process_turn,
            inputs=[audio_input, screenshot_input],
            outputs=output,
        )
        # Reset both inputs and the reply box.
        clear_btn.click(
            lambda: (None, None, ""),
            outputs=[audio_input, screenshot_input, output],
        )

    return demo


if __name__ == "__main__":
    demo = create_app()
    demo.launch()