Spaces:

multimodalart
/

scenema-audio

Running on Zero

App Files Community

multimodalart committed 10 days ago

Commit

ce4f7f1

1 Parent(s): 12aed82

Copy reference audio before processor deletes it; tidy UI into advanced accordion

Browse files

Files changed (1) hide show

app.py +36 -26

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import asyncio
 import base64
 import logging
 import os
 import sys
 import tempfile
 import uuid
@@ -244,7 +245,15 @@ def generate(
         async def patched(url):
             if url.startswith("file://"):
-                return url[len("file://"):]
             return await original(url)
         processor._download_reference = patched
@@ -319,37 +328,38 @@ with gr.Blocks(title="Scenema Audio") as demo:
                 lines=2,
                 placeholder='e.g. "A warm male voice with a slight British accent..."',
             )
-            with gr.Row():
-                gender = gr.Radio(["male", "female"], value="male", label="Gender")
-                language = gr.Dropdown(
-                    ["en", "es", "fr", "de", "it", "pt", "ja", "zh", "ko"],
-                    value="en", label="Language",
-                )
-                shot = gr.Radio(
-                    ["closeup", "wide", "scene"], value="closeup", label="Shot"
-                )
-            with gr.Accordion("Scene & direction (optional)", open=False):
                 scene = gr.Textbox(label="Scene", placeholder="e.g. busy cafe at midday")
                 action = gr.Textbox(label="Performance direction (<action>)")
                 sound_before = gr.Textbox(label="Sound event before speech (<sound>)")
-            with gr.Accordion("Raw XML override (optional)", open=False):
                 raw_xml = gr.Textbox(
-                    label="<speak> XML (overrides fields above when set)",
                     lines=4,
                 )
-            with gr.Accordion("Voice cloning (optional)", open=False):
-                reference_audio = gr.Audio(
-                    label="Reference voice (10-20s)",
-                    type="filepath",
-                )
-            with gr.Row():
-                mode = gr.Radio(
-                    ["generate", "voice_design"], value="generate", label="Mode"
-                )
-                seed = gr.Number(value=42, precision=0, label="Seed (-1 = random)")
-            with gr.Row():
-                background_sfx = gr.Checkbox(value=False, label="Keep background SFX")
-                skip_vc = gr.Checkbox(value=False, label="Skip SeedVC post-processing")
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column(scale=2):
             out_audio = gr.Audio(label="Output", type="filepath")

 import base64
 import logging
 import os
+import shutil
 import sys
 import tempfile
 import uuid
         async def patched(url):
             if url.startswith("file://"):
+                # Copy to a throwaway temp file — AudioProcessor unlinks
+                # ref_wav_path on completion, which would otherwise destroy
+                # the user's uploaded gradio file and break subsequent runs.
+                src = url[len("file://"):]
+                suffix = Path(src).suffix or ".wav"
+                tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+                tmp.close()
+                shutil.copyfile(src, tmp.name)
+                return tmp.name
             return await original(url)
         processor._download_reference = patched
                 lines=2,
                 placeholder='e.g. "A warm male voice with a slight British accent..."',
             )
+            gender = gr.Radio(["male", "female"], value="male", label="Gender")
+            reference_audio = gr.Audio(
+                label="Voice cloning reference (optional, 10-20s)",
+                type="filepath",
+            )
+            with gr.Accordion("Advanced settings", open=False):
+                with gr.Row():
+                    mode = gr.Radio(
+                        ["generate", "voice_design"],
+                        value="generate",
+                        label="Mode",
+                        info="voice_design = 15s voice preview",
+                    )
+                    seed = gr.Number(value=42, precision=0, label="Seed (-1 = random)")
+                with gr.Row():
+                    language = gr.Dropdown(
+                        ["en", "es", "fr", "de", "it", "pt", "ja", "zh", "ko"],
+                        value="en", label="Language",
+                    )
+                    shot = gr.Radio(
+                        ["closeup", "wide", "scene"], value="closeup", label="Shot"
+                    )
                 scene = gr.Textbox(label="Scene", placeholder="e.g. busy cafe at midday")
                 action = gr.Textbox(label="Performance direction (<action>)")
                 sound_before = gr.Textbox(label="Sound event before speech (<sound>)")
+                with gr.Row():
+                    background_sfx = gr.Checkbox(value=False, label="Keep background SFX")
+                    skip_vc = gr.Checkbox(value=False, label="Skip SeedVC post-processing")
                 raw_xml = gr.Textbox(
+                    label="Raw <speak> XML (overrides all fields above when set)",
                     lines=4,
                 )
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column(scale=2):
             out_audio = gr.Audio(label="Output", type="filepath")