"""
Buddy Web — Gradio-based multimodal assistant.

Easy to demo without any desktop dependencies.
Only requires a Hugging Face token for inference.

    python webapp.py

Opens at http://localhost:7860
"""
import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from huggingface_hub import InferenceClient
from PIL import Image
|
|
|
|
_ASR_MODEL = "openai/whisper-large-v3"
_CHAT_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"


def _image_to_data_url(image):
    """Encode a PIL image as a base64 PNG ``data:`` URL."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return "data:image/png;base64," + encoded


def _transcribe(client, audio):
    """Transcribe a ``(sample_rate, samples)`` recording and return the text.

    The recording is written to a temporary WAV file that is always removed,
    even when writing or transcription fails. A transcription failure is
    reported as a bracketed placeholder string instead of raising.
    """
    sample_rate, samples = audio
    # Only reserve a path here; the handle is closed before the file is written.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name

    try:
        wavfile.write(wav_path, sample_rate, samples.astype(np.int16))
        try:
            transcript = client.automatic_speech_recognition(
                wav_path,
                model=_ASR_MODEL,
            )
            return transcript.text
        except Exception as e:
            return "[Kunne ikke transkribere: " + str(e) + "]"
    finally:
        # Covers the write step too, so a failed write no longer leaks the file.
        os.unlink(wav_path)


def create_app():
    """Build the Buddy Gradio UI and return the ``gr.Blocks`` app.

    Returns:
        The assembled Blocks app; the caller is responsible for ``launch()``.
    """
    client = InferenceClient()

    def process_turn(audio, screenshot):
        """Handle one turn: optional voice recording plus optional screenshot.

        Args:
            audio: ``(sample_rate, samples)`` tuple from the audio widget,
                or None when nothing was recorded.
            screenshot: PIL image from the image widget, or None.

        Returns:
            The model's reply, or a user-facing message when there is no
            input or the chat request fails.
        """
        if audio is None and screenshot is None:
            return "Vennligst snakk inn noe eller last opp et skjermbilde."

        content = []
        if screenshot is not None:
            # The request body is JSON, so the image must travel as an
            # ``image_url`` part (here a base64 data URL), not a PIL object.
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": _image_to_data_url(screenshot)},
                }
            )

        if audio is not None:
            text = _transcribe(client, audio)
        else:
            # Screenshot only: fall back to a default "describe this" prompt.
            text = "Beskriv det du ser paa skjermbildet."
        content.append({"type": "text", "text": text})

        messages = [{"role": "user", "content": content}]

        try:
            response = client.chat_completion(
                model=_CHAT_MODEL,
                messages=messages,
                max_tokens=512,
            )
            reply = response.choices[0].message.content
        except Exception as e:
            reply = "Feil: " + str(e)

        return reply

    with gr.Blocks(title="Buddy — Push-to-Talk AI Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Buddy AI Assistent")
        gr.Markdown("Snakk inn eller skriv. Assistenten ser skjermbildet ditt.")

        with gr.Row():
            # Left column: inputs and the submit button.
            with gr.Column():
                screenshot_input = gr.Image(
                    label="Skjermbilde / Dokument",
                    type="pil",
                    sources=["upload", "clipboard"],
                )
                audio_input = gr.Audio(
                    label="Push-to-Talk (spill inn stemme)",
                    sources=["microphone"],
                    type="numpy",
                )
                submit_btn = gr.Button("Send til Buddy", variant="primary")

            # Right column: the reply and the reset button.
            with gr.Column():
                output = gr.Textbox(
                    label="Buddy svarer",
                    lines=10,
                    interactive=False,
                )
                clear_btn = gr.Button("Tom")

        submit_btn.click(
            fn=process_turn,
            inputs=[audio_input, screenshot_input],
            outputs=output,
        )
        # Reset both inputs and the reply box in one click.
        clear_btn.click(lambda: (None, None, ""), outputs=[audio_input, screenshot_input, output])

    return demo
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces at port 7860 without creating a public share link.
    demo = create_app()
    demo.launch(
        share=False,
        server_port=7860,
        server_name="0.0.0.0",
    )
|
|