Spaces:

owenisas
/

stable-audio-3-lab

Running on Zero

App Files Files Community

owenisas committed on 6 days ago

Commit

465731f

verified ·

1 Parent(s): a3557ba

Add Stable Audio 3 testing Space

Browse files

Files changed (4) hide show

.gitignore +9 -0
README.md +32 -6
app.py +579 -0
requirements.txt +13 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+__pycache__/
+*.pyc
+.gradio/
+.cache/
+outputs/
+*.wav
+*.flac
+*.mp3
+*.m4a

README.md CHANGED Viewed

@@ -1,13 +1,39 @@
 ---
 title: Stable Audio 3 Lab
-emoji: 🌍
-colorFrom: yellow
-colorTo: red
 sdk: gradio
-sdk_version: 6.14.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Stable Audio 3 Lab
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
+sdk_version: 6.3.0
 app_file: app.py
+python_version: '3.10'
+suggested_hardware: a10g-small
 pinned: false
+license: mit
 ---
+# Stable Audio 3 Lab
+Gradio Space for testing Stability AI's Stable Audio 3 collections:
+- Standard collection: `stabilityai/stable-audio-3-small-music`, `stabilityai/stable-audio-3-small-sfx`, `stabilityai/stable-audio-3-medium`
+- Extra collection generation checkpoints: `small-music-base`, `small-sfx-base`, `medium-base`
+- Extra collection autoencoders: `SAME-S`, `SAME-L`
+The optimized repo (`stabilityai/stable-audio-3-optimized`) currently ships MLX and TensorRT assets rather than a generic `model_config.json` + `model.safetensors` checkpoint. This Space lists it in Coverage, but does not run it through the PyTorch `stable_audio_3` path.
+## Access
+The post-trained Stable Audio 3 checkpoints are gated on Hugging Face. Before using them here:
+1. Accept the terms on each gated model page while logged in.
+2. Add a read-only `HF_TOKEN` secret to this Space.
+Base checkpoints are not gated, but they are intended mainly for fine-tuning and may not sound as polished.
+## Hardware
+- Small models can run on CPU, but GPU is still preferred.
+- Medium and Medium Base expect CUDA plus `flash-attn`.
+- `SAME-L` is treated as GPU-first; `SAME-S` can be used for CPU autoencoder round trips.
+The Space is configured with `suggested_hardware: a10g-small`. Upgrade hardware if medium generations fail due to memory or Flash Attention support.

app.py ADDED Viewed

	@@ -0,0 +1,579 @@

+from __future__ import annotations
+import gc
+import importlib
+import importlib.util
+import json
+import os
+import tempfile
+import time
+from dataclasses import dataclass
+from typing import Any
+import gradio as gr
+import numpy as np
+# Turn on the HF_HUB_ENABLE_HF_TRANSFER flag unless the environment already sets it.
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+@dataclass(frozen=True)
+class GenerationModel:
+    """Immutable description of one text-to-audio checkpoint offered in the Generate tab."""
+    # Human-readable name shown in the model dropdown and in error messages.
+    label: str
+    # Short identifier; also the key of this entry in GENERATION_MODELS.
+    key: str
+    # Hugging Face repository id, surfaced in the model info and run metadata.
+    repo_id: str
+    # Checkpoint family; the entries in this file use "post-trained" or "base".
+    family: str
+    # Prompt pre-filled in the UI when this model is selected.
+    default_prompt: str
+    # Initial value of the duration slider.
+    default_duration: int
+    # Upper bound applied to the duration slider.
+    max_duration: int
+    # Initial value of the steps slider.
+    default_steps: int
+    # Initial value of the CFG slider.
+    default_cfg: float
+    # Sampler pre-selected in the sampler dropdown.
+    default_sampler: str
+    # When True, generation is refused without CUDA unless the CPU override is ticked.
+    requires_cuda: bool = False
+    # When True, a token hint is shown if no Hugging Face token is in the environment.
+    gated: bool = False
+    # Free-form remark displayed in the model info panel and run metadata.
+    note: str = ""
+# Registry of every generation checkpoint the Generate tab can select, keyed by the
+# same short identifier stored in each entry's ``key`` field. The post-trained
+# entries default to 8 steps / CFG 1.0 / "pingpong"; the base entries default to
+# 50 steps / CFG 7.0 / "euler".
+GENERATION_MODELS: dict[str, GenerationModel] = {
+    "small-music": GenerationModel(
+        label="Stable Audio 3 Small Music",
+        key="small-music",
+        repo_id="stabilityai/stable-audio-3-small-music",
+        family="post-trained",
+        default_prompt=(
+            "Warm lo-fi house groove, soft sidechained pads, clean drums, "
+            "late-night atmosphere, 118 BPM"
+        ),
+        default_duration=20,
+        max_duration=120,
+        default_steps=8,
+        default_cfg=1.0,
+        default_sampler="pingpong",
+        gated=True,
+        note="Lightweight music checkpoint.",
+    ),
+    "small-sfx": GenerationModel(
+        label="Stable Audio 3 Small SFX",
+        key="small-sfx",
+        repo_id="stabilityai/stable-audio-3-small-sfx",
+        family="post-trained",
+        default_prompt="Close binaural rain on a window, soft cloth movement, detailed texture",
+        default_duration=8,
+        max_duration=120,
+        default_steps=8,
+        default_cfg=1.0,
+        default_sampler="pingpong",
+        gated=True,
+        note="Lightweight sound-effects checkpoint.",
+    ),
+    "medium": GenerationModel(
+        label="Stable Audio 3 Medium",
+        key="medium",
+        repo_id="stabilityai/stable-audio-3-medium",
+        family="post-trained",
+        default_prompt=(
+            "Cinematic ambient electronic cue, deep sub pulse, shimmering stereo texture, "
+            "slow evolving melody"
+        ),
+        default_duration=20,
+        max_duration=380,
+        default_steps=8,
+        default_cfg=1.0,
+        default_sampler="pingpong",
+        requires_cuda=True,
+        gated=True,
+        note="High-quality checkpoint; CUDA and flash-attn are expected.",
+    ),
+    "small-music-base": GenerationModel(
+        label="Stable Audio 3 Small Music Base",
+        key="small-music-base",
+        repo_id="stabilityai/stable-audio-3-small-music-base",
+        family="base",
+        default_prompt="Dreamlike synthpop instrumental, surreal film sequence, 120 BPM",
+        default_duration=20,
+        max_duration=120,
+        default_steps=50,
+        default_cfg=7.0,
+        default_sampler="euler",
+        note="Base checkpoint intended mainly for fine-tuning.",
+    ),
+    "small-sfx-base": GenerationModel(
+        label="Stable Audio 3 Small SFX Base",
+        key="small-sfx-base",
+        repo_id="stabilityai/stable-audio-3-small-sfx-base",
+        family="base",
+        default_prompt="Chugging train coming into station with horn",
+        default_duration=7,
+        max_duration=120,
+        default_steps=50,
+        default_cfg=7.0,
+        default_sampler="euler",
+        note="Base checkpoint intended mainly for fine-tuning.",
+    ),
+    "medium-base": GenerationModel(
+        label="Stable Audio 3 Medium Base",
+        key="medium-base",
+        repo_id="stabilityai/stable-audio-3-medium-base",
+        family="base",
+        default_prompt="Dreamlike synthpop instrumental, surreal film sequence, 120 BPM",
+        default_duration=20,
+        max_duration=380,
+        default_steps=50,
+        default_cfg=7.0,
+        default_sampler="euler",
+        requires_cuda=True,
+        note="Base checkpoint intended mainly for fine-tuning; CUDA and flash-attn are expected.",
+    ),
+}
+# Autoencoders offered in the Autoencoder tab. ``requires_cuda`` makes
+# load_autoencoder() refuse to run without CUDA unless the CPU override is ticked.
+AUTOENCODER_MODELS = {
+    "same-s": {
+        "label": "SAME-S",
+        "repo_id": "stabilityai/SAME-S",
+        "requires_cuda": False,
+    },
+    "same-l": {
+        "label": "SAME-L",
+        "repo_id": "stabilityai/SAME-L",
+        "requires_cuda": True,
+    },
+}
+# Static rows for the Coverage tab table. Column order matches the Dataframe
+# headers used there: collection entry, type, Space path, status.
+COLLECTION_ROWS = [
+    ["stable-audio-3-small-music", "Text-to-audio", "Generate tab", "Gated post-trained small music"],
+    ["stable-audio-3-small-sfx", "Text-to-audio", "Generate tab", "Gated post-trained small SFX"],
+    ["stable-audio-3-medium", "Text-to-audio", "Generate tab", "Gated medium; needs CUDA + flash-attn"],
+    ["stable-audio-3-small-music-base", "Text-to-audio", "Generate tab", "Base checkpoint"],
+    ["stable-audio-3-small-sfx-base", "Text-to-audio", "Generate tab", "Base checkpoint"],
+    ["stable-audio-3-medium-base", "Text-to-audio", "Generate tab", "Base checkpoint; needs CUDA + flash-attn"],
+    ["stable-audio-3-optimized", "Optimized assets", "Listed only", "MLX/TensorRT artifacts, not generic PyTorch generation"],
+    ["SAME-S", "Autoencoder", "Autoencoder tab", "CPU-capable round trip"],
+    ["SAME-L", "Autoencoder", "Autoencoder tab", "Large autoencoder; CUDA recommended"],
+]
+# Single-slot caches: each holds at most one loaded model together with the key it
+# was loaded under, so switching models replaces the previous one.
+MODEL_CACHE: dict[str, Any] = {"key": None, "model": None}
+AE_CACHE: dict[str, Any] = {"key": None, "model": None}
+def gpu_task(duration: int):
+    """Return a decorator built from ``spaces.GPU(duration=duration)``.
+    If importing ``spaces`` or building the decorator fails for any reason, a
+    pass-through decorator that returns the function unchanged is used instead.
+    """
+    def _passthrough(fn):
+        return fn
+    try:
+        import spaces
+        decorator = spaces.GPU(duration=duration)
+    except Exception:
+        decorator = _passthrough
+    return decorator
+def import_torch():
+    """Import ``torch`` on demand and return the module object."""
+    torch_module = importlib.import_module("torch")
+    return torch_module
+def current_device(torch_module: Any) -> str:
+    """Pick a device name from the given torch module: "cuda", then "mps", else "cpu"."""
+    if torch_module.cuda.is_available():
+        device = "cuda"
+    elif hasattr(torch_module.backends, "mps") and torch_module.backends.mps.is_available():
+        # The hasattr guard tolerates torch builds that expose no ``backends.mps``.
+        device = "mps"
+    else:
+        device = "cpu"
+    return device
+def flash_attn_available() -> bool:
+    """Report whether a ``flash_attn`` module can be located, without importing it."""
+    spec = importlib.util.find_spec("flash_attn")
+    return spec is not None
+def stable_audio_token_hint(model: GenerationModel) -> str:
+    """Return a reminder about gated access, or "" when none is needed.
+    The reminder is produced only for gated models when neither ``HF_TOKEN`` nor
+    ``HUGGING_FACE_HUB_TOKEN`` is set to a non-empty value in the environment.
+    """
+    token_present = bool(os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN"))
+    if model.gated and not token_present:
+        return (
+            "This is a gated Stability model. Accept the model terms on Hugging Face "
+            "and add a read-only HF_TOKEN Space secret if download fails."
+        )
+    return ""
+def assert_generation_runtime(model: GenerationModel, allow_cpu_medium: bool) -> str:
+    """Check that ``model`` may run on the detected device and return that device name.
+    Models that do not require CUDA always pass. A CUDA-requiring model raises
+    ``gr.Error`` when CUDA is absent and the CPU override is off, or when CUDA is
+    present but ``flash_attn`` cannot be found.
+    """
+    device = current_device(import_torch())
+    if model.requires_cuda:
+        if device == "cuda":
+            if not flash_attn_available():
+                raise gr.Error(
+                    f"{model.label} expects flash-attn on CUDA. Rebuild the Space with the "
+                    "flash-attn wheel in requirements.txt or use a small model."
+                )
+        elif not allow_cpu_medium:
+            raise gr.Error(
+                f"{model.label} is blocked on this runtime because CUDA is not available. "
+                "Use a GPU Space or enable the CPU override for a slow/debug-only attempt."
+            )
+    return device
+def normalize_audio_array(data: np.ndarray) -> np.ndarray:
+    """Convert raw audio samples to a finite float32 array with a leading channel axis.
+    Integer input is divided by the largest magnitude its dtype can hold; other
+    input is cast to float32 as-is. 1-D input gains a leading axis of length one,
+    2-D input is transposed (presumably from (samples, channels), confirm against
+    the Gradio audio component). NaN and infinities are replaced with 0.0.
+    Raises ``gr.Error`` when the input is neither 1-D nor 2-D.
+    """
+    samples = np.asarray(data)
+    if np.issubdtype(samples.dtype, np.integer):
+        bounds = np.iinfo(samples.dtype)
+        full_scale = float(max(abs(bounds.min), bounds.max))
+        samples = samples.astype(np.float32) / full_scale
+    else:
+        samples = samples.astype(np.float32)
+    if samples.ndim not in (1, 2):
+        raise gr.Error("Audio must be mono or stereo.")
+    channels_first = samples[None, :] if samples.ndim == 1 else samples.T
+    return np.nan_to_num(channels_first, nan=0.0, posinf=0.0, neginf=0.0)
+def clear_torch_memory() -> None:
+    """Run the garbage collector, then release PyTorch's cached CUDA blocks if CUDA is present.
+    Cleanup is best-effort: any failure while importing or querying torch is
+    ignored so that freeing memory never aborts the caller.
+    """
+    # Collect first: objects kept alive only by reference cycles are not freed
+    # until the collector runs, and empty_cache() can only hand back blocks that
+    # no live tensor still occupies.
+    gc.collect()
+    try:
+        torch = import_torch()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        # Deliberately swallowed: torch may be missing or CUDA may be unusable.
+        pass
+def load_generation_model(model_key: str, allow_cpu_medium: bool):
+    """Return ``(model, device)`` for ``model_key``, reusing the cached model on a key match.
+    The runtime check runs on every call, including cache hits, so it can raise
+    ``gr.Error`` before anything is loaded. On a miss the cache slot is emptied
+    and memory is cleared before the new checkpoint is loaded.
+    """
+    device = assert_generation_runtime(GENERATION_MODELS[model_key], allow_cpu_medium)
+    cache_hit = MODEL_CACHE["model"] is not None and MODEL_CACHE["key"] == model_key
+    if cache_hit:
+        return MODEL_CACHE["model"], device
+    # Drop the old reference before cleanup so its memory can actually be released.
+    MODEL_CACHE.update(model=None, key=None)
+    clear_torch_memory()
+    from stable_audio_3 import StableAudioModel
+    # Half precision is requested only when running on CUDA.
+    loaded = StableAudioModel.from_pretrained(model_key, model_half=(device == "cuda"))
+    MODEL_CACHE.update(key=model_key, model=loaded)
+    return loaded, device
+def load_autoencoder(model_key: str, allow_cpu_same_l: bool):
+    """Return ``(autoencoder, device)`` for ``model_key``, reusing the cached one on a key match.
+    Raises ``gr.Error`` when the entry requires CUDA, CUDA is not the detected
+    device, and the CPU override is off. On a cache miss the slot is emptied and
+    memory is cleared before the new autoencoder is loaded.
+    """
+    spec = AUTOENCODER_MODELS[model_key]
+    device = current_device(import_torch())
+    blocked = spec["requires_cuda"] and device != "cuda" and not allow_cpu_same_l
+    if blocked:
+        raise gr.Error(
+            f"{spec['label']} is blocked on this runtime because CUDA is not available. "
+            "Use SAME-S or enable the CPU override for a slow/debug-only attempt."
+        )
+    cache_hit = AE_CACHE["model"] is not None and AE_CACHE["key"] == model_key
+    if cache_hit:
+        return AE_CACHE["model"], device
+    # Drop the old reference before cleanup so its memory can actually be released.
+    AE_CACHE.update(model=None, key=None)
+    clear_torch_memory()
+    from stable_audio_3 import AutoencoderModel
+    loaded = AutoencoderModel.from_pretrained(model_key)
+    AE_CACHE.update(key=model_key, model=loaded)
+    return loaded, device
+def model_changed(model_key: str):
+    """Build the UI updates applied when a different generation model is selected.
+    Returns a 6-tuple: updates for prompt, duration (value and maximum), steps,
+    CFG and sampler, followed by the dict shown in the model info panel.
+    """
+    spec = GENERATION_MODELS[model_key]
+    prompt_update = gr.update(value=spec.default_prompt)
+    duration_update = gr.update(value=spec.default_duration, maximum=spec.max_duration)
+    steps_update = gr.update(value=spec.default_steps)
+    cfg_update = gr.update(value=spec.default_cfg)
+    sampler_update = gr.update(value=spec.default_sampler)
+    info = {
+        "repo_id": spec.repo_id,
+        "family": spec.family,
+        "max_duration_s": spec.max_duration,
+        "default_sampler": spec.default_sampler,
+        "note": spec.note,
+        "token_hint": stable_audio_token_hint(spec),
+    }
+    return (prompt_update, duration_update, steps_update, cfg_update, sampler_update, info)
+@gpu_task(duration=int(os.getenv("SPACES_GENERATE_GPU_SECONDS", "900")))
+def generate_audio(
+    model_key: str,
+    prompt: str,
+    negative_prompt: str,
+    duration: float,
+    steps: int,
+    cfg_scale: float,
+    sampler_type: str,
+    seed: int,
+    chunked_decode: bool,
+    allow_cpu_medium: bool,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Generate audio from a text prompt and write it to a temporary WAV file.
+    Args:
+        model_key: Key into GENERATION_MODELS selecting the checkpoint.
+        prompt: Text prompt; must contain non-whitespace characters.
+        negative_prompt: Optional negative prompt; empty, whitespace-only or
+            ``None`` is forwarded to the model as ``None``.
+        duration: Requested length, passed to the model as a float.
+        steps: Sampler step count.
+        cfg_scale: Guidance scale.
+        sampler_type: Sampler name forwarded unchanged to the model.
+        seed: Seed; a negative value selects a random seed in [0, 100000).
+        chunked_decode: Forwarded to the model as a bool.
+        allow_cpu_medium: Lets CUDA-requiring models attempt a run without CUDA.
+        progress: Gradio progress reporter.
+    Returns:
+        ``(wav_path, metadata)`` where ``metadata`` is a dict describing the run.
+    Raises:
+        gr.Error: If the prompt is empty or the runtime check rejects the model.
+    """
+    if not prompt or not prompt.strip():
+        raise gr.Error("Prompt is required.")
+    model_def = GENERATION_MODELS[model_key]
+    progress(0.05, desc="Loading model")
+    started = time.time()
+    seed = int(seed)
+    if seed < 0:
+        seed = int.from_bytes(os.urandom(4), "little") % 100000
+    # A cleared textbox or a direct API call may deliver None instead of "";
+    # treat both as "no negative prompt" rather than failing on None.strip().
+    negative = (negative_prompt or "").strip() or None
+    model, device = load_generation_model(model_key, allow_cpu_medium)
+    progress(0.25, desc="Generating")
+    audio = model.generate(
+        prompt=prompt.strip(),
+        negative_prompt=negative,
+        duration=float(duration),
+        steps=int(steps),
+        cfg_scale=float(cfg_scale),
+        seed=seed,
+        sampler_type=sampler_type,
+        chunked_decode=bool(chunked_decode),
+    )
+    progress(0.9, desc="Writing WAV")
+    import torchaudio
+    sample_rate = int(model.model_config["sample_rate"])
+    # Only the first item of the returned batch is saved, clamped to [-1, 1].
+    waveform = audio[0].detach().to("cpu").float().clamp(-1, 1)
+    # delete=False keeps the file on disk after close so its path can be returned.
+    out_file = tempfile.NamedTemporaryFile(prefix=f"{model_key}-", suffix=".wav", delete=False)
+    out_file.close()
+    torchaudio.save(out_file.name, waveform, sample_rate)
+    elapsed = round(time.time() - started, 3)
+    metadata = {
+        "model": model_def.key,
+        "repo_id": model_def.repo_id,
+        "family": model_def.family,
+        "device": device,
+        "duration_s": float(duration),
+        "steps": int(steps),
+        "cfg_scale": float(cfg_scale),
+        "sampler_type": sampler_type,
+        "seed": seed,
+        "sample_rate": sample_rate,
+        "elapsed_s": elapsed,
+        "output_file": out_file.name,
+        "note": model_def.note,
+    }
+    return out_file.name, metadata
+@gpu_task(duration=int(os.getenv("SPACES_AUTOENCODER_GPU_SECONDS", "600")))
+def roundtrip_autoencoder(
+    model_key: str,
+    audio_input: tuple[int, np.ndarray] | None,
+    chunked: bool,
+    allow_cpu_same_l: bool,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Encode the supplied audio with an autoencoder, decode it again and save the result.
+    ``audio_input`` is a ``(sample_rate, samples)`` pair or ``None``. Returns
+    ``(wav_path, metadata)``; raises ``gr.Error`` when no audio was provided or
+    the selected autoencoder is blocked on this runtime.
+    """
+    if audio_input is None:
+        raise gr.Error("Upload or record audio first.")
+    progress(0.05, desc="Loading autoencoder")
+    t0 = time.time()
+    autoencoder, device = load_autoencoder(model_key, allow_cpu_same_l)
+    progress(0.25, desc="Encoding")
+    input_rate, samples = audio_input
+    prepared = normalize_audio_array(samples)
+    torch = import_torch()
+    source = torch.from_numpy(prepared)
+    use_chunks = bool(chunked)
+    latents = autoencoder.encode(source, int(input_rate), chunked=use_chunks)
+    progress(0.65, desc="Decoding")
+    reconstruction = autoencoder.decode(latents, chunked=use_chunks)
+    # Only the first item of the decoded batch is kept, clamped to [-1, 1].
+    reconstruction = reconstruction[0].detach().to("cpu").float().clamp(-1, 1)
+    import torchaudio
+    # delete=False keeps the file on disk after close so its path can be returned.
+    handle = tempfile.NamedTemporaryFile(prefix=f"{model_key}-roundtrip-", suffix=".wav", delete=False)
+    handle.close()
+    wav_path = handle.name
+    torchaudio.save(wav_path, reconstruction, int(autoencoder.sample_rate))
+    metadata = {
+        "autoencoder": model_key,
+        "repo_id": AUTOENCODER_MODELS[model_key]["repo_id"],
+        "device": device,
+        "input_sample_rate": int(input_rate),
+        "output_sample_rate": int(autoencoder.sample_rate),
+        "input_shape": list(source.shape),
+        "latent_shape": list(latents.shape),
+        "elapsed_s": round(time.time() - t0, 3),
+        "output_file": wav_path,
+    }
+    return wav_path, metadata
+def unload_models():
+    """Empty both model caches, clear torch memory and return a status dict."""
+    for cache in (MODEL_CACHE, AE_CACHE):
+        cache["key"] = None
+        cache["model"] = None
+    clear_torch_memory()
+    return {"status": "unloaded"}
+def runtime_status():
+    """Summarise the runtime as a JSON-friendly dict.
+    On success the dict reports the device, CUDA device name, flash-attn
+    availability, token presence and the keys of the cached models. If torch
+    cannot be imported or queried, a two-key dict carrying the exception repr
+    and the device "unavailable" is returned instead.
+    """
+    try:
+        torch = import_torch()
+        device = current_device(torch)
+        cuda_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else None
+    except Exception as exc:
+        return {"torch": repr(exc), "device": "unavailable"}
+    token_present = bool(os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN"))
+    return {
+        "device": device,
+        "cuda_name": cuda_name,
+        "flash_attn": flash_attn_available(),
+        "hf_token_present": token_present,
+        "loaded_generation_model": MODEL_CACHE["key"],
+        "loaded_autoencoder": AE_CACHE["key"],
+    }
+# Dropdown choices as (display label, value) pairs, in registry order.
+MODEL_CHOICES = [(model.label, key) for key, model in GENERATION_MODELS.items()]
+AE_CHOICES = [(value["label"], key) for key, value in AUTOENCODER_MODELS.items()]
+# Sampler names offered in the UI; the chosen name is forwarded to model.generate().
+SAMPLER_CHOICES = ["pingpong", "euler", "rk4", "dpmpp", "dpmpp-3m-sde"]
+# Page-level CSS: caps the container width and sets a minimum height for the run buttons.
+css = """
+.gradio-container { max-width: 1160px !important; }
+#run-buttons button { min-height: 42px; }
+"""
+# Three-tab UI: Generate (text-to-audio), Autoencoder (round trip) and Coverage (static table).
+# NOTE(review): Gradio 6 is believed to have moved ``css`` from Blocks() to launch();
+# confirm against the pinned gradio==6.3.0 before relying on this stylesheet being applied.
+with gr.Blocks(title="Stable Audio 3 Lab", css=css) as demo:
+    gr.Markdown("# Stable Audio 3 Lab")
+    with gr.Tab("Generate"):
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
+                # Initial control values mirror the "small-sfx" registry entry.
+                model_dropdown = gr.Dropdown(
+                    label="Model",
+                    choices=MODEL_CHOICES,
+                    value="small-sfx",
+                    interactive=True,
+                )
+                prompt_box = gr.Textbox(
+                    label="Prompt",
+                    value=GENERATION_MODELS["small-sfx"].default_prompt,
+                    lines=4,
+                )
+                negative_prompt_box = gr.Textbox(label="Negative prompt", lines=2)
+                with gr.Row():
+                    duration_slider = gr.Slider(
+                        label="Duration",
+                        minimum=1,
+                        maximum=GENERATION_MODELS["small-sfx"].max_duration,
+                        value=GENERATION_MODELS["small-sfx"].default_duration,
+                        step=1,
+                    )
+                    steps_slider = gr.Slider(
+                        label="Steps",
+                        minimum=1,
+                        maximum=100,
+                        value=GENERATION_MODELS["small-sfx"].default_steps,
+                        step=1,
+                    )
+                    cfg_slider = gr.Slider(
+                        label="CFG",
+                        minimum=0,
+                        maximum=12,
+                        value=GENERATION_MODELS["small-sfx"].default_cfg,
+                        step=0.1,
+                    )
+                with gr.Row():
+                    sampler_dropdown = gr.Dropdown(
+                        label="Sampler",
+                        choices=SAMPLER_CHOICES,
+                        value=GENERATION_MODELS["small-sfx"].default_sampler,
+                    )
+                    # -1 asks generate_audio() to pick a random seed.
+                    seed_number = gr.Number(label="Seed", value=-1, precision=0)
+                with gr.Row():
+                    chunked_decode_box = gr.Checkbox(label="Chunked decode", value=True)
+                    allow_cpu_medium_box = gr.Checkbox(label="CPU override", value=False)
+                with gr.Row(elem_id="run-buttons"):
+                    generate_button = gr.Button("Generate", variant="primary")
+                    unload_button = gr.Button("Unload")
+                    status_button = gr.Button("Runtime")
+            with gr.Column(scale=1):
+                model_info = gr.JSON(
+                    label="Model info",
+                    value={
+                        "repo_id": GENERATION_MODELS["small-sfx"].repo_id,
+                        "family": GENERATION_MODELS["small-sfx"].family,
+                        "note": GENERATION_MODELS["small-sfx"].note,
+                        "token_hint": stable_audio_token_hint(GENERATION_MODELS["small-sfx"]),
+                    },
+                )
+                audio_output = gr.Audio(label="Output", type="filepath")
+                metadata_output = gr.JSON(label="Run metadata")
+        # Selecting a model resets the controls to that model's defaults; the outputs
+        # list is in the same order as the tuple returned by model_changed().
+        model_dropdown.change(
+            model_changed,
+            inputs=model_dropdown,
+            outputs=[
+                prompt_box,
+                duration_slider,
+                steps_slider,
+                cfg_slider,
+                sampler_dropdown,
+                model_info,
+            ],
+        )
+        # The inputs list is in the same order as generate_audio()'s positional parameters.
+        generate_button.click(
+            generate_audio,
+            inputs=[
+                model_dropdown,
+                prompt_box,
+                negative_prompt_box,
+                duration_slider,
+                steps_slider,
+                cfg_slider,
+                sampler_dropdown,
+                seed_number,
+                chunked_decode_box,
+                allow_cpu_medium_box,
+            ],
+            outputs=[audio_output, metadata_output],
+            concurrency_limit=1,
+        )
+        # Unload and Runtime reuse the run-metadata panel to show their result dicts.
+        unload_button.click(unload_models, outputs=metadata_output)
+        status_button.click(runtime_status, outputs=metadata_output)
+    with gr.Tab("Autoencoder"):
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
+                ae_dropdown = gr.Dropdown(label="Autoencoder", choices=AE_CHOICES, value="same-s")
+                ae_audio_input = gr.Audio(label="Input", sources=["upload", "microphone"], type="numpy")
+                with gr.Row():
+                    ae_chunked_box = gr.Checkbox(label="Chunked", value=True)
+                    ae_allow_cpu_box = gr.Checkbox(label="CPU override", value=False)
+                ae_button = gr.Button("Round Trip", variant="primary")
+            with gr.Column(scale=1):
+                ae_output = gr.Audio(label="Decoded", type="filepath")
+                ae_metadata = gr.JSON(label="Round-trip metadata")
+        ae_button.click(
+            roundtrip_autoencoder,
+            inputs=[ae_dropdown, ae_audio_input, ae_chunked_box, ae_allow_cpu_box],
+            outputs=[ae_output, ae_metadata],
+            concurrency_limit=1,
+        )
+    with gr.Tab("Coverage"):
+        gr.Dataframe(
+            value=COLLECTION_ROWS,
+            headers=["Collection entry", "Type", "Space path", "Status"],
+            datatype=["str", "str", "str", "str"],
+            interactive=False,
+            wrap=True,
+        )
+        # runtime_status() is called once here, while the UI is being built, so this
+        # panel shows a snapshot from startup rather than live state.
+        gr.JSON(label="Runtime", value=runtime_status())
+# Script entry point: enable the request queue with a default concurrency limit of 1, then serve.
+if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=1).launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+--extra-index-url https://download.pytorch.org/whl/cu126
+torch==2.7.1
+torchaudio==2.7.1
+gradio==6.3.0
+spaces
+hf_transfer
+soundfile
+git+https://github.com/Stability-AI/stable-audio-3.git@main
+# Required for Stable Audio 3 Medium on CUDA. This is the wheel recommended by
+# Stability AI's README for torch 2.7 / CUDA 12.6 / Python 3.10.
+https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.16/flash_attn-2.6.3+cu126torch2.7-cp310-cp310-linux_x86_64.whl