File size: 3,207 Bytes
b9cf533
 
 
 
 
 
 
 
 
 
 
 
 
 
6ca820a
 
b9cf533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Buddy Web — Gradio-basert multimodal assistent (HF Space versjon)
"""
import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from huggingface_hub import InferenceClient
from PIL import Image


def _to_int16_pcm(samples):
    """Return *samples* as 16-bit PCM without destroying the signal.

    A bare ``astype(np.int16)`` truncates float audio (assumed to be
    normalised to [-1.0, 1.0]) to all zeros and wraps 32-bit integers, so
    each input dtype is handled explicitly.

    Args:
        samples: Array-like of audio samples, mono or (frames, channels).

    Returns:
        A ``numpy.ndarray`` with dtype ``int16`` and the same shape.
    """
    samples = np.asarray(samples)
    if samples.dtype == np.int16:
        return samples
    if np.issubdtype(samples.dtype, np.floating):
        # NaN/inf are neutralised first because casting them to an integer
        # type is undefined.
        scaled = np.clip(np.nan_to_num(samples), -1.0, 1.0) * 32767.0
        return scaled.astype(np.int16)
    if samples.dtype == np.int32:
        # Keep the 16 most significant bits instead of wrapping around.
        return (samples >> 16).astype(np.int16)
    return samples.astype(np.int16)


def _image_to_data_url(image):
    """Encode a PIL image as a ``data:image/png;base64,...`` URL.

    Args:
        image: A ``PIL.Image.Image`` (what ``gr.Image(type="pil")`` yields).

    Returns:
        A string usable as the ``url`` of an ``image_url`` content part.
    """
    if image.mode not in ("RGB", "RGBA", "L"):
        # Normalise modes that the PNG writer may not accept.
        image = image.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return "data:image/png;base64," + encoded


def create_app(
    *,
    asr_model="openai/whisper-large-v3",
    chat_model="Qwen/Qwen2.5-VL-7B-Instruct",
    max_tokens=512,
):
    """Build the Buddy Gradio UI and wire it to the HF Inference API.

    Args:
        asr_model: Model id used for speech-to-text.
        chat_model: Vision-language model id used to produce the reply.
        max_tokens: Upper bound on the number of generated reply tokens.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for ``launch()``.
    """
    # The token is read from the HF_TOKEN environment variable (the original
    # note says HF Spaces provides it); os.environ.get yields None when unset.
    client = InferenceClient(token=os.environ.get("HF_TOKEN"))

    def process_turn(audio, screenshot):
        """Handle one user turn: optional speech plus optional screenshot.

        Args:
            audio: ``(sample_rate, samples)`` tuple from the microphone
                component, or None when nothing was recorded.
            screenshot: PIL image from the image component, or None.

        Returns:
            The model reply, or a user-facing message describing the problem.
        """
        if audio is None and screenshot is None:
            return "Vennligst snakk inn noe eller last opp et skjermbilde."

        content = []
        if screenshot is not None:
            # A raw PIL object is not JSON-serialisable; the chat API takes
            # images as an ``image_url`` part, here an inline data URL.
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": _image_to_data_url(screenshot)},
                }
            )

        if audio is not None:
            sample_rate, samples = audio
            # Only reserve a path here: the handle is closed before the file
            # is re-opened by name, which some platforms require.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                tmp_path = f.name
            try:
                # Writing happens inside the try so a failed write is reported
                # like a failed transcription and the file is still removed.
                wavfile.write(tmp_path, sample_rate, _to_int16_pcm(samples))
                transcript = client.automatic_speech_recognition(
                    tmp_path,
                    model=asr_model,
                )
                text = transcript.text
            except Exception as e:
                # Best effort: the failure note is forwarded as the prompt
                # text so the turn still produces a visible response.
                text = "[Kunne ikke transkribere: " + str(e) + "]"
            finally:
                os.unlink(tmp_path)
        else:
            # Screenshot only: fall back to a default "describe this" prompt.
            text = "Beskriv det du ser paa skjermbildet."
        content.append({"type": "text", "text": text})

        messages = [{"role": "user", "content": content}]

        try:
            response = client.chat_completion(
                model=chat_model,
                messages=messages,
                max_tokens=max_tokens,
            )
            reply = response.choices[0].message.content
        except Exception as e:
            # Show the failure in the output box instead of crashing the UI.
            reply = "Feil: " + str(e)

        return reply

    with gr.Blocks(title="Buddy — Push-to-Talk AI Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Buddy AI Assistent")
        gr.Markdown("Snakk inn eller skriv. Assistenten ser skjermbildet ditt.")

        with gr.Row():
            # Left column: inputs and the submit button.
            with gr.Column():
                screenshot_input = gr.Image(
                    label="Skjermbilde / Dokument",
                    type="pil",
                    sources=["upload", "clipboard"],
                )
                audio_input = gr.Audio(
                    label="Push-to-Talk (spill inn stemme)",
                    sources=["microphone"],
                    type="numpy",
                )
                submit_btn = gr.Button("Send til Buddy", variant="primary")

            # Right column: read-only reply box and the reset button.
            with gr.Column():
                output = gr.Textbox(
                    label="Buddy svarer",
                    lines=10,
                    interactive=False,
                )
                clear_btn = gr.Button("Tom")

        submit_btn.click(
            fn=process_turn,
            inputs=[audio_input, screenshot_input],
            outputs=output,
        )
        # Reset both inputs and the reply text in one step.
        clear_btn.click(lambda: (None, None, ""), outputs=[audio_input, screenshot_input, output])

    return demo


if __name__ == "__main__":
    demo = create_app()
    demo.launch()