Spaces:

stabilityai
/

stable-audio-3

Running on Zero

App Files Files Community

multimodalart HF Staff committed on about 21 hours ago

Commit

5e77923

verified ·

1 Parent(s): 366a3bb

initial commit: SA3 medium + small-music + small-sfx

Browse files

Files changed (3) hide show

README.md +16 -4
app.py +252 -0
requirements.txt +3 -0

README.md CHANGED Viewed

@@ -1,13 +1,25 @@
 ---
 title: Stable Audio 3
-emoji: 🐢
-colorFrom: red
 colorTo: purple
 sdk: gradio
 sdk_version: 6.14.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Stable Audio 3
+emoji: 🎵
+colorFrom: indigo
 colorTo: purple
 sdk: gradio
 sdk_version: 6.14.0
 app_file: app.py
 pinned: false
+license: other
+short_description: Text-to-audio with SA3 Medium / Small Music / Small SFX.
+suggested_hardware: zero-a10g
+models:
+  - stabilityai/stable-audio-3-medium
+  - stabilityai/stable-audio-3-small-music
+  - stabilityai/stable-audio-3-small-sfx
 ---
+# Stable Audio 3
+ZeroGPU demo of the [Stable Audio 3](https://huggingface.co/stabilityai) family. All three variants are preloaded when the app module is imported; switch between them with the radio button.
+- [`stable-audio-3-medium`](https://huggingface.co/stabilityai/stable-audio-3-medium) — general audio (largest).
+- [`stable-audio-3-small-music`](https://huggingface.co/stabilityai/stable-audio-3-small-music) — 0.6B, music-focused.
+- [`stable-audio-3-small-sfx`](https://huggingface.co/stabilityai/stable-audio-3-small-sfx) — 0.6B, sound effects.

app.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""ZeroGPU Gradio demo for Stable Audio 3 — Medium, Small Music, Small SFX.
+All three models are preloaded at module level (per the ZeroGPU contract), and
+a radio selector picks which one runs inside the ``@spaces.GPU`` infer call.
+The visible UI mirrors the high-level ``stable_audio_3`` defaults (prompt +
+duration); steps / CFG / sampler / seed live in an Advanced accordion.
+"""
+from __future__ import annotations
+# spaces must be imported before any CUDA-touching module.
+import spaces  # noqa: F401
+import os
+import tempfile
+import time
+from dataclasses import dataclass
+import gradio as gr
+import torch
+import torchaudio
+from einops import rearrange
+from stable_audio_tools import get_pretrained_model
+from stable_audio_tools.inference.generation import generate_diffusion_cond_inpaint
+# ---------------------------------------------------------------------------
+# Variants
+# ---------------------------------------------------------------------------
+@dataclass
+class Variant:
+    """Static description of one selectable Stable Audio 3 model variant."""
+    # Short identifier; used as the key into LOADED and as the Radio value.
+    key: str
+    # Repository id passed to get_pretrained_model at startup.
+    repo: str
+    # Human-readable text displayed for this option in the model selector.
+    label: str
+    # Duration in seconds the slider is reset to when this variant is chosen.
+    default_duration: int
+    # Example prompt shown as the prompt box placeholder for this variant.
+    placeholder: str
+# Registry of the variants offered in the UI. Order matters: VARIANTS[0]
+# supplies the initial Radio value, prompt placeholder and duration default.
+VARIANTS: list[Variant] = [
+    Variant(
+        key="medium",
+        repo="stabilityai/stable-audio-3-medium",
+        label="Medium — general audio (largest)",
+        default_duration=60,
+        placeholder="A dream-like Synthpop instrumental that would accompany a dream-sequence in a surrealist movie 120 BPM",
+    ),
+    Variant(
+        key="small-music",
+        repo="stabilityai/stable-audio-3-small-music",
+        label="Small Music — 0.6B, music-focused",
+        default_duration=60,
+        placeholder="Cinematic neo-soul groove with electric piano, brushed drums, walking upright bass, smoky vibe 92 BPM",
+    ),
+    # The SFX variant defaults to a much shorter clip than the music variants.
+    Variant(
+        key="small-sfx",
+        repo="stabilityai/stable-audio-3-small-sfx",
+        label="Small SFX — 0.6B, sound effects",
+        default_duration=7,
+        placeholder="Chugging train coming into station with horn",
+    ),
+]
+# ---------------------------------------------------------------------------
+# Preload all variants at module level (ZeroGPU CUDA emulation accepts it)
+# ---------------------------------------------------------------------------
+@dataclass
+class LoadedVariant:
+    """A Variant paired with its loaded model and the config values read at startup."""
+    # The static description this entry was loaded from.
+    variant: Variant
+    # Model object returned by get_pretrained_model (moved to CUDA / float16 at startup).
+    model: object
+    # Taken from config["sample_rate"]; also used as the rate when saving the WAV.
+    sample_rate: int
+    # Taken from config["sample_size"]; passed as sample_size to the generation call.
+    sample_size: int
+    # sample_size // sample_rate; upper bound applied to the requested duration.
+    max_seconds: int
+LOADED: dict[str, LoadedVariant] = {}
+for v in VARIANTS:
+    print(f"[startup] loading {v.repo} …", flush=True)
+    t0 = time.time()
+    model, config = get_pretrained_model(v.repo)
+    sr = int(config["sample_rate"])
+    ss = int(config["sample_size"])
+    model = model.to("cuda").to(torch.float16)
+    LOADED[v.key] = LoadedVariant(
+        variant=v,
+        model=model,
+        sample_rate=sr,
+        sample_size=ss,
+        max_seconds=ss // sr,
+    )
+    print(
+        f"[startup] {v.key} ready in {time.time() - t0:.1f}s · "
+        f"sr={sr} · sample_size={ss} (~{ss // sr}s max)",
+        flush=True,
+    )
+VARIANT_CHOICES = [(v.label, v.key) for v in VARIANTS]
+SAMPLERS = ["pingpong", "k-dpmpp-2m", "k-heun", "dpmpp-2s-ancestral", "dpmpp-3m-sde"]
+# ---------------------------------------------------------------------------
+# Inference
+# ---------------------------------------------------------------------------
+@spaces.GPU(duration=180)
+def infer(
+    variant_key: str,
+    prompt: str,
+    duration: int = 60,
+    steps: int = 8,
+    cfg_scale: float = 1.0,
+    sampler_type: str = "pingpong",
+    seed: int = 0,
+    progress: gr.Progress = gr.Progress(),
+):
+    prompt = (prompt or "").strip()
+    if not prompt:
+        raise gr.Error("Please enter a prompt.")
+    if variant_key not in LOADED:
+        raise gr.Error(f"Unknown variant {variant_key!r}.")
+    lv = LOADED[variant_key]
+    duration = max(1, min(int(duration), lv.max_seconds))
+    progress(0.1, desc=f"[{variant_key}] preparing conditioning")
+    conditioning = [{"prompt": prompt, "seconds_total": int(duration)}]
+    if seed and int(seed) > 0:
+        torch.manual_seed(int(seed))
+    else:
+        torch.seed()
+    progress(0.25, desc=f"[{variant_key}] sampling {steps} steps with {sampler_type}")
+    t0 = time.time()
+    output = generate_diffusion_cond_inpaint(
+        lv.model,
+        steps=int(steps),
+        cfg_scale=float(cfg_scale),
+        conditioning=conditioning,
+        sample_size=lv.sample_size,
+        sampler_type=sampler_type,
+        device="cuda",
+    )
+    print(f"[infer/{variant_key}] sampling done in {time.time() - t0:.1f}s", flush=True)
+    progress(0.92, desc="Normalising & saving")
+    output = rearrange(output, "b d n -> d (b n)")
+    output = (
+        output.to(torch.float32)
+        .div(torch.max(torch.abs(output)).clamp(min=1e-9))
+        .clamp(-1, 1)
+        .mul(32767)
+        .to(torch.int16)
+        .cpu()
+    )
+    output = output[:, : int(duration) * lv.sample_rate]
+    out_path = os.path.join(tempfile.mkdtemp(), f"sa3_{variant_key}.wav")
+    torchaudio.save(out_path, output, lv.sample_rate)
+    return out_path
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
+# Markdown rendered at the top of the page.
+DESCRIPTION = """
+# 🎵 Stable Audio 3
+Text-to-audio generation with [Stable Audio 3](https://huggingface.co/stabilityai). Pick a variant, write a prompt, hit Generate.
+"""
+# Example rows for gr.Examples. Column order is [variant key, prompt,
+# duration in seconds], matching the inputs list the Examples component is
+# wired to; the remaining infer() parameters fall back to their defaults.
+EXAMPLES = [
+    ["medium",      "House music that encapsulates the feeling of being at a festival in the sunny weather with all your friends 124 BPM", 60],
+    ["small-music", "Cinematic neo-soul groove with electric piano, brushed drums, walking upright bass, smoky vibe 92 BPM", 45],
+    ["small-music", "Driving techno track with rolling 16th-note hats, deep sub bass, acid arpeggios building tension 132 BPM", 60],
+    ["small-sfx",   "Chugging train coming into station with horn", 7],
+    ["small-sfx",   "Heavy rain on a tin roof with distant thunder rolls", 10],
+    ["medium",      "Rainy night, lo-fi hip-hop beat with vinyl crackle, mellow piano chords, soft kick and snare 80 BPM", 30],
+]
+def _on_variant_change(variant_key: str):
+    lv = LOADED[variant_key]
+    return (
+        gr.update(maximum=lv.max_seconds, value=min(lv.variant.default_duration, lv.max_seconds),
+                  label=f"Duration (s) · model max {lv.max_seconds}s"),
+        gr.update(placeholder=lv.variant.placeholder),
+    )
+with gr.Blocks(theme=gr.themes.Soft(), title="Stable Audio 3") as demo:
+    gr.Markdown(DESCRIPTION)
+    variant = gr.Radio(
+        choices=VARIANT_CHOICES,
+        value=VARIANTS[0].key,
+        label="Model",
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            prompt = gr.Textbox(
+                label="Prompt",
+                placeholder=VARIANTS[0].placeholder,
+                lines=3,
+            )
+            duration = gr.Slider(
+                1, LOADED[VARIANTS[0].key].max_seconds,
+                value=VARIANTS[0].default_duration, step=1,
+                label=f"Duration (s) · model max {LOADED[VARIANTS[0].key].max_seconds}s",
+            )
+            with gr.Accordion("Advanced settings", open=False):
+                steps = gr.Slider(1, 50, value=8, step=1, label="Steps")
+                cfg_scale = gr.Slider(0.5, 8.0, value=1.0, step=0.1, label="CFG scale")
+                sampler_type = gr.Dropdown(SAMPLERS, value="pingpong", label="Sampler")
+                seed = gr.Number(value=0, precision=0, label="Seed (0 = random)")
+            run_btn = gr.Button("🎼 Generate", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            audio_out = gr.Audio(label="Output", type="filepath", autoplay=True)
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=[variant, prompt, duration],
+        outputs=[audio_out],
+        fn=infer,
+        cache_examples=True,
+        cache_mode="lazy",
+        label="Examples (lazy-cached on first click)",
+    )
+    variant.change(
+        fn=_on_variant_change,
+        inputs=[variant],
+        outputs=[duration, prompt],
+    )
+    run_btn.click(
+        fn=infer,
+        inputs=[variant, prompt, duration, steps, cfg_scale, sampler_type, seed],
+        outputs=[audio_out],
+    )
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+# torch / gradio / spaces are preinstalled on ZeroGPU Spaces.
+stable-audio-tools
+einops