| import gradio as gr |
| import torch |
| from api import FlexSED |
| import tempfile |
| import os |
| import spaces |
|
|
| |
# Build the detector once at import time so every request reuses the same
# instance; run on the GPU when torch reports one, otherwise on the CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"
flexsed = FlexSED(device=_device)
|
|
@spaces.GPU
def run_flexsed(audio_file, event_list):
    """Run FlexSED on an audio file and return the path of the prediction plot.

    Args:
        audio_file: Path of the audio file to analyse. A falsy value (nothing
            uploaded) short-circuits the call.
        event_list: Semicolon-separated event names, e.g. ``"Dog; Door"``.
            Whitespace around each name is stripped and empty entries are
            dropped. ``None`` is treated like an empty string.

    Returns:
        Path of the ``.png`` plot, or ``None`` when no audio file or no usable
        event name was supplied.
    """
    if not audio_file:
        return None

    # Guard against a missing textbox value so we return None instead of
    # failing on ``None.split``.
    events = [name.strip() for name in (event_list or "").split(";") if name.strip()]
    if not events:
        return None

    preds = flexsed.run_inference(audio_file, events)

    # Give every call its own output directory: a fixed file name in the shared
    # temp dir lets overlapping requests overwrite one another's plot.
    output_dir = tempfile.mkdtemp(prefix="flexsed_")
    output_fname = os.path.join(output_dir, "flexsed_output")
    flexsed.to_multi_plot(preds, events, fname=output_fname)

    # fname is passed without an extension; presumably to_multi_plot appends
    # ".png" when saving -- verify against the FlexSED API.
    return f"{output_fname}.png"
|
|
|
|
| |
# Default prompt shown in the textbox, used by the first example and restored
# by the Clear button; kept in one place so the three cannot drift apart.
DEFAULT_EVENTS = "Male speech; Door; Dog; Laughter"

# Demo UI: inputs on the left, prediction plot and bundled examples on the right.
# NOTE(review): the source this was restored from had lost its indentation and
# had mis-encoded emoji. The nesting below (Examples inside the right column)
# and the leading emoji on the "Welcome", "Learn more" and "Upload or choose"
# lines are best guesses -- confirm against the intended layout and glyphs.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
    gr.Markdown("""
    ## 🎧 FlexSED: A Flexible Open-Vocabulary Sound Event Detection System

    👋 Welcome to the **FlexSED live demo** — explore **prompt-guided sound event detection** in real audio clips.

    🔗 Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
    """)

    gr.Markdown("### 👉 Upload or choose an example below to detect sound events:")

    with gr.Row():
        # Left column: audio upload, event prompt and action buttons.
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="🎵 Upload Audio (.wav)")
            text_input = gr.Textbox(
                label="Event list (semicolon-separated)",
                value=DEFAULT_EVENTS,
            )

            with gr.Row():
                detect_btn = gr.Button("🎯 Detect", variant="primary")
                clear_btn = gr.Button("🧹 Clear")

        # Right column: resulting plot plus clickable example inputs.
        with gr.Column(scale=1):
            image_output = gr.Image(
                label="Prediction Plot",
                show_label=True,
                elem_id="output-image",
            )
            gr.Examples(
                examples=[
                    ["example.wav", DEFAULT_EVENTS],
                    # Events are split on ";" only, so "Gunshot, gunfire" is a
                    # single event name despite the comma.
                    ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
                ],
                inputs=[audio_input, text_input],
                label="Example Audios",
            )

    # Event wiring: Detect runs inference; Clear resets the two inputs to
    # their initial state (no audio, default event list).
    detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output)
    clear_btn.click(lambda: (None, DEFAULT_EVENTS), outputs=[audio_input, text_input])


if __name__ == "__main__":
    app.launch()
|
|