multimodalart HF Staff committed on
Commit
5e77923
·
verified ·
1 Parent(s): 366a3bb

initial commit: SA3 medium + small-music + small-sfx

Browse files
Files changed (3) hide show
  1. README.md +16 -4
  2. app.py +252 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,13 +1,25 @@
1
  ---
2
  title: Stable Audio 3
3
- emoji: 🐢
4
- colorFrom: red
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 6.14.0
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
 
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
  title: Stable Audio 3
3
+ emoji: 🎵
4
+ colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 6.14.0
 
8
  app_file: app.py
9
  pinned: false
10
+ license: other
11
+ short_description: Text-to-audio with SA3 Medium / Small Music / Small SFX.
12
+ suggested_hardware: zero-a10g
13
+ models:
14
+ - stabilityai/stable-audio-3-medium
15
+ - stabilityai/stable-audio-3-small-music
16
+ - stabilityai/stable-audio-3-small-sfx
17
  ---
18
 
19
+ # Stable Audio 3
20
+
21
+ ZeroGPU demo of the [Stable Audio 3](https://huggingface.co/stabilityai) family. All three variants are preloaded at module load; switch between them with the radio button.
22
+
23
+ - [`stable-audio-3-medium`](https://huggingface.co/stabilityai/stable-audio-3-medium) — general audio (largest).
24
+ - [`stable-audio-3-small-music`](https://huggingface.co/stabilityai/stable-audio-3-small-music) — 0.6B, music-focused.
25
+ - [`stable-audio-3-small-sfx`](https://huggingface.co/stabilityai/stable-audio-3-small-sfx) — 0.6B, sound effects.
app.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ZeroGPU Gradio demo for Stable Audio 3 — Medium, Small Music, Small SFX.
2
+
3
+ All three models are preloaded at module level (per the ZeroGPU contract), and
4
+ a radio selector picks which one runs inside the ``@spaces.GPU`` infer call.
5
+ The visible UI mirrors the high-level ``stable_audio_3`` defaults (prompt +
6
+ duration); steps / CFG / sampler / seed live in an Advanced accordion.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ # spaces must be imported before any CUDA-touching module.
12
+ import spaces # noqa: F401
13
+
14
+ import os
15
+ import tempfile
16
+ import time
17
+ from dataclasses import dataclass
18
+
19
+ import gradio as gr
20
+ import torch
21
+ import torchaudio
22
+ from einops import rearrange
23
+
24
+ from stable_audio_tools import get_pretrained_model
25
+ from stable_audio_tools.inference.generation import generate_diffusion_cond_inpaint
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Variants
30
+ # ---------------------------------------------------------------------------
31
+
32
+
33
@dataclass(frozen=True)
class Variant:
    """Static description of one Stable Audio 3 checkpoint offered in the UI.

    Instances live in module-level constants that are shared by every
    request, so the dataclass is frozen to rule out accidental mutation.
    """

    key: str  # short identifier; used as the radio value and the LOADED key
    repo: str  # repo id handed to get_pretrained_model
    label: str  # human-readable text shown for the radio option
    default_duration: int  # duration slider value applied when selected
    placeholder: str  # sample prompt shown as the textbox placeholder
40
+
41
+
42
# Catalogue of selectable checkpoints. The UI reads VARIANTS[0] for its
# initial state, so the first row is the default selection.
# Row layout: (key, repo, label, default duration, placeholder prompt).
VARIANTS: list[Variant] = [
    Variant(
        key=key,
        repo=repo,
        label=label,
        default_duration=seconds,
        placeholder=hint,
    )
    for key, repo, label, seconds, hint in (
        (
            "medium",
            "stabilityai/stable-audio-3-medium",
            "Medium — general audio (largest)",
            60,
            "A dream-like Synthpop instrumental that would accompany a dream-sequence in a surrealist movie 120 BPM",
        ),
        (
            "small-music",
            "stabilityai/stable-audio-3-small-music",
            "Small Music — 0.6B, music-focused",
            60,
            "Cinematic neo-soul groove with electric piano, brushed drums, walking upright bass, smoky vibe 92 BPM",
        ),
        (
            "small-sfx",
            "stabilityai/stable-audio-3-small-sfx",
            "Small SFX — 0.6B, sound effects",
            7,
            "Chugging train coming into station with horn",
        ),
    )
]
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Preload all variants at module level (ZeroGPU CUDA emulation accepts it)
69
+ # ---------------------------------------------------------------------------
70
+
71
@dataclass(frozen=True)
class LoadedVariant:
    """A catalogue entry paired with its instantiated model and audio geometry.

    One instance per variant is created at import time and then shared by
    every request, so the record is frozen to prevent accidental rebinding
    of its fields from a handler.
    """

    variant: Variant  # catalogue entry this model was loaded from
    model: object  # model object returned by get_pretrained_model
    sample_rate: int  # taken from the checkpoint config's "sample_rate"
    sample_size: int  # taken from the checkpoint config's "sample_size"
    max_seconds: int  # sample_size // sample_rate; caps the requested duration
78
+
79
+
80
def _load_variant(v: Variant) -> LoadedVariant:
    """Load one checkpoint, move it to CUDA in float16 and log the timing.

    Kept as a function so the per-variant temporaries (model, config, sizes,
    timer) stay local instead of leaking into the module namespace.
    """
    print(f"[startup] loading {v.repo} …", flush=True)
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments during a long load.
    started = time.perf_counter()
    model, config = get_pretrained_model(v.repo)
    sr = int(config["sample_rate"])
    ss = int(config["sample_size"])
    max_seconds = ss // sr
    loaded = LoadedVariant(
        variant=v,
        model=model.to("cuda").to(torch.float16),
        sample_rate=sr,
        sample_size=ss,
        max_seconds=max_seconds,
    )
    print(
        f"[startup] {v.key} ready in {time.perf_counter() - started:.1f}s · "
        f"sr={sr} · sample_size={ss} (~{max_seconds}s max)",
        flush=True,
    )
    return loaded


# Every variant is loaded eagerly at import time, in catalogue order.
LOADED: dict[str, LoadedVariant] = {v.key: _load_variant(v) for v in VARIANTS}

# (label, value) pairs for the model radio selector.
VARIANT_CHOICES = [(v.label, v.key) for v in VARIANTS]
SAMPLERS = ["pingpong", "k-dpmpp-2m", "k-heun", "dpmpp-2s-ancestral", "dpmpp-3m-sde"]
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Inference
107
+ # ---------------------------------------------------------------------------
108
+
109
+
110
def _to_int16_pcm(audio: torch.Tensor, num_samples: int) -> torch.Tensor:
    """Trim ``audio`` to ``num_samples`` and peak-normalise it to int16 PCM.

    Args:
        audio: Floating-point tensor indexed as (channels, samples).
        num_samples: Number of leading samples to keep.

    Returns:
        A CPU int16 tensor scaled so the loudest kept sample maps to 32767.
    """
    # Convert before measuring the peak: 1e-9 is below float16's smallest
    # positive value (~6e-8), so on a half-precision tensor the floor below
    # is not representable and silent output could divide by zero.
    audio = audio.to(torch.float32)[:, :num_samples]
    if audio.numel() == 0:
        return audio.to(torch.int16).cpu()
    # Measure the peak after trimming so audio in the discarded tail cannot
    # lower the level of the part that is actually returned.
    peak = audio.abs().max().clamp(min=1e-9)
    return audio.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()


@spaces.GPU(duration=180)
def infer(
    variant_key: str,
    prompt: str,
    duration: int = 60,
    steps: int = 8,
    cfg_scale: float = 1.0,
    sampler_type: str = "pingpong",
    seed: int = 0,
    progress: gr.Progress = gr.Progress(),
):
    """Generate audio for ``prompt`` with the chosen variant and save a WAV.

    Args:
        variant_key: Key into ``LOADED`` selecting the model.
        prompt: Text prompt; surrounding whitespace is stripped.
        duration: Requested length in seconds, clamped to
            ``[1, max_seconds]`` of the selected variant.
        steps: Number of sampling steps.
        cfg_scale: Classifier-free guidance scale passed to the sampler.
        sampler_type: Sampler name passed through to the generation call.
        seed: A positive value seeds torch's RNG; anything else reseeds
            it non-deterministically.
        progress: Gradio progress tracker used for status messages.

    Returns:
        Path of the WAV file written to a fresh temporary directory.

    Raises:
        gr.Error: If the prompt is empty or ``variant_key`` is unknown.
    """
    prompt = (prompt or "").strip()
    if not prompt:
        raise gr.Error("Please enter a prompt.")
    if variant_key not in LOADED:
        raise gr.Error(f"Unknown variant {variant_key!r}.")
    lv = LOADED[variant_key]

    duration = max(1, min(int(duration), lv.max_seconds))

    progress(0.1, desc=f"[{variant_key}] preparing conditioning")
    conditioning = [{"prompt": prompt, "seconds_total": duration}]

    # NOTE(review): upstream stable-audio-tools generation helpers take their
    # own ``seed`` argument and may reseed torch internally, which would make
    # this call ineffective -- confirm against the installed version and pass
    # the seed through explicitly if so.
    if seed and int(seed) > 0:
        torch.manual_seed(int(seed))
    else:
        torch.seed()

    progress(0.25, desc=f"[{variant_key}] sampling {steps} steps with {sampler_type}")
    t0 = time.time()
    output = generate_diffusion_cond_inpaint(
        lv.model,
        steps=int(steps),
        cfg_scale=float(cfg_scale),
        conditioning=conditioning,
        sample_size=lv.sample_size,
        sampler_type=sampler_type,
        device="cuda",
    )
    print(f"[infer/{variant_key}] sampling done in {time.time() - t0:.1f}s", flush=True)

    progress(0.92, desc="Normalising & saving")
    # Fold the batch axis into time so the result is (channels, samples).
    output = rearrange(output, "b d n -> d (b n)")
    output = _to_int16_pcm(output, duration * lv.sample_rate)

    out_path = os.path.join(tempfile.mkdtemp(), f"sa3_{variant_key}.wav")
    torchaudio.save(out_path, output, lv.sample_rate)
    return out_path
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # UI
170
+ # ---------------------------------------------------------------------------
171
+
172
# Markdown rendered at the top of the page.
DESCRIPTION = (
    "\n"
    "# 🎵 Stable Audio 3\n"
    "\n"
    "Text-to-audio generation with [Stable Audio 3](https://huggingface.co/stabilityai). "
    "Pick a variant, write a prompt, hit Generate.\n"
)

# Example rows in the order (variant key, prompt, duration in seconds).
EXAMPLES = [
    [
        "medium",
        "House music that encapsulates the feeling of being at a festival in the sunny weather with all your friends 124 BPM",
        60,
    ],
    [
        "small-music",
        "Cinematic neo-soul groove with electric piano, brushed drums, walking upright bass, smoky vibe 92 BPM",
        45,
    ],
    [
        "small-music",
        "Driving techno track with rolling 16th-note hats, deep sub bass, acid arpeggios building tension 132 BPM",
        60,
    ],
    [
        "small-sfx",
        "Chugging train coming into station with horn",
        7,
    ],
    [
        "small-sfx",
        "Heavy rain on a tin roof with distant thunder rolls",
        10,
    ],
    [
        "medium",
        "Rainy night, lo-fi hip-hop beat with vinyl crackle, mellow piano chords, soft kick and snare 80 BPM",
        30,
    ],
]
186
+
187
+
188
def _on_variant_change(variant_key: str):
    """Build the UI updates to apply when a different model is selected.

    Returns a pair of ``gr.update`` objects: the first re-ranges, resets and
    relabels the duration slider for the chosen variant; the second swaps
    the prompt placeholder to that variant's sample prompt.
    """
    loaded = LOADED[variant_key]
    ceiling = loaded.max_seconds
    # The variant's default may exceed what the model supports, so cap it.
    slider_update = gr.update(
        maximum=ceiling,
        value=min(loaded.variant.default_duration, ceiling),
        label=f"Duration (s) · model max {ceiling}s",
    )
    prompt_update = gr.update(placeholder=loaded.variant.placeholder)
    return slider_update, prompt_update
195
+
196
+
197
# NOTE(review): the Space pins Gradio 6.x; newer Gradio releases may expect
# `theme` to be passed to launch() rather than to Blocks -- confirm against
# the migration notes for the pinned version.
with gr.Blocks(theme=gr.themes.Soft(), title="Stable Audio 3") as demo:
    # The first catalogue entry drives every initial widget value.
    default_variant = VARIANTS[0]
    default_loaded = LOADED[default_variant.key]

    gr.Markdown(DESCRIPTION)

    variant = gr.Radio(
        label="Model",
        choices=VARIANT_CHOICES,
        value=default_variant.key,
    )

    with gr.Row():
        # Left column: inputs and the generate button.
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                lines=3,
                label="Prompt",
                placeholder=default_variant.placeholder,
            )
            duration = gr.Slider(
                minimum=1,
                maximum=default_loaded.max_seconds,
                value=default_variant.default_duration,
                step=1,
                label=f"Duration (s) · model max {default_loaded.max_seconds}s",
            )
            with gr.Accordion("Advanced settings", open=False):
                steps = gr.Slider(1, 50, value=8, step=1, label="Steps")
                cfg_scale = gr.Slider(0.5, 8.0, value=1.0, step=0.1, label="CFG scale")
                sampler_type = gr.Dropdown(SAMPLERS, value="pingpong", label="Sampler")
                seed = gr.Number(value=0, precision=0, label="Seed (0 = random)")
            run_btn = gr.Button("🎼 Generate", variant="primary", size="lg")

        # Right column: generated audio.
        with gr.Column(scale=1):
            audio_out = gr.Audio(label="Output", type="filepath", autoplay=True)

    # Examples only supply (variant, prompt, duration); the remaining infer
    # parameters fall back to the function's defaults.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[variant, prompt, duration],
        outputs=[audio_out],
        fn=infer,
        cache_examples=True,
        cache_mode="lazy",
        label="Examples (lazy-cached on first click)",
    )

    # NOTE(review): `change` presumably also fires when an example row sets
    # the radio programmatically, which would reset the example's duration to
    # the variant default -- verify, and consider `input` if that is unwanted.
    variant.change(
        fn=_on_variant_change,
        inputs=[variant],
        outputs=[duration, prompt],
    )

    run_btn.click(
        fn=infer,
        inputs=[variant, prompt, duration, steps, cfg_scale, sampler_type, seed],
        outputs=[audio_out],
    )


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # torch / gradio / spaces are preinstalled on ZeroGPU Spaces.
2
+ stable-audio-tools
3
+ einops