ayf3 commited on
Commit
44ebeaa
·
verified ·
1 Parent(s): d365d5c

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +32 -12
app.py CHANGED
@@ -1,7 +1,12 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloning Space - VoxCPM V4
4
- Fix: Force float32 on CPU to avoid bfloat16 dimension errors in MiniCPM4 attention
 
 
 
 
 
5
  """
6
 
7
  import os
@@ -10,15 +15,36 @@ import tempfile
10
  import soundfile as sf
11
  import traceback
12
  from pathlib import Path
 
 
13
 
14
- # ็Žฏๅขƒๅ˜้‡ๆฃ€ๆŸฅ
15
  HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def load_model():
18
  """ๅŠ ่ฝฝ VoxCPM ๆจกๅž‹"""
19
  try:
20
  from voxcpm import VoxCPM
21
- import torch
22
 
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"Loading VoxCPM model on {device}...")
@@ -29,10 +55,6 @@ def load_model():
29
  model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
30
 
31
  # CRITICAL FIX: Force float32 on CPU
32
- # VoxCPM2 uses bfloat16 by default, which causes dimension/dtype errors on CPU:
33
- # 1. "Dimension out of range" in MiniCPM4's scaled_dot_product_attention (bfloat16 SDPA bug)
34
- # 2. "mat1 and mat2 must have the same dtype" when model is float32 but inputs are bfloat16
35
- # Fix: change config.dtype BEFORE anything creates tensors, then convert model
36
  if device == "cpu":
37
  print("Converting model to float32 for CPU compatibility...")
38
  # Step 1: Change config dtype so _inference creates float32 tensors
@@ -154,7 +176,7 @@ PRESET_TEXTS = {
154
 
155
  # ๅˆ›ๅปบ Gradio ็•Œ้ข
156
  with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
157
- gr.Markdown("# ๐ŸŽญ NumberBlocks One Voice Cloning (VoxCPM V4)")
158
  gr.Markdown("### ไฝฟ็”จ VoxCPM 2 ๆจกๅž‹ๅ…‹้š† One ็š„ๅฃฐ้Ÿณ")
159
 
160
  with gr.Row():
@@ -211,16 +233,14 @@ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
211
  - **CFG Value**: ๆŽงๅˆถ้Ÿณ่‰ฒ็›ธไผผๅบฆ๏ผŒ้ป˜่ฎค 2.0๏ผŒ่ถŠ้ซ˜่ถŠๅƒๅ‚่€ƒ้Ÿณ่‰ฒ
212
  - **ๆŽจ็†ๆญฅๆ•ฐ**: ้ป˜่ฎค 10๏ผŒ่ถŠ้ซ˜่ดจ้‡่ถŠๅฅฝไฝ†็”Ÿๆˆ่ถŠๆ…ข
213
  - **ๆจกๅž‹**: VoxCPM 2 (openbmb/VoxCPM2)
214
- - **V4 ไฟฎๅค**: CPU ไธŠไฝฟ็”จ float32 ้ฟๅ… bfloat16 ็ปดๅบฆ้”™่ฏฏ
215
  """)
216
 
217
  if __name__ == "__main__":
218
- # ๅฏๅŠจๆ—ถ้ข„ๅŠ ่ฝฝๆจกๅž‹
219
  import threading
220
  def preload():
221
  print("Preloading VoxCPM model...")
222
  ensure_model()
223
 
224
  threading.Thread(target=preload, daemon=True).start()
225
-
226
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloning Space - VoxCPM V5
4
+ Fix: float32 on CPU + monkey-patch SDPA mask shape for CPU compatibility
5
+
6
+ Root cause of "Dimension out of range":
7
+ MiniCPM4's Attention.forward_step creates a 1D attn_mask but SDPA on CPU
8
+ expects at least 2D for proper broadcasting with GQA (Grouped Query Attention).
9
+ On GPU, the flash-attention backend handles this; on CPU the math backend does not.
10
  """
11
 
12
  import os
 
15
  import soundfile as sf
16
  import traceback
17
  from pathlib import Path
18
+ import torch
19
+ import torch.nn.functional as F
20
 
 
21
  HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
22
 
23
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
24
+ # Monkey-patch: fix SDPA mask shape for CPU
25
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
26
+ _original_sdpa = F.scaled_dot_product_attention
27
+
28
+ def _cpu_safe_sdpa(query, key, value, attn_mask=None, **kwargs):
29
+ """Wrapper that fixes 1D attn_mask for CPU SDPA."""
30
+ if attn_mask is not None and attn_mask.dim() == 1 and not torch.cuda.is_available():
31
+ # attn_mask is (seq_len,) but SDPA needs (B, H, L, S) or broadcastable
32
+ # query shape: (B, H, L, D), key shape: (B, H_kv, S, D)
33
+ B, H, L, D = query.shape
34
+ S = key.shape[2]
35
+ # Reshape 1D mask to (1, 1, 1, S) for proper broadcasting
36
+ attn_mask = attn_mask.view(1, 1, 1, S).expand(B, H, L, S)
37
+ return _original_sdpa(query, key, value, attn_mask=attn_mask, **kwargs)
38
+
39
+ # Apply the patch globally
40
+ F.scaled_dot_product_attention = _cpu_safe_sdpa
41
+ print("โœ… Patched scaled_dot_product_attention for CPU mask shape fix")
42
+
43
+
44
  def load_model():
45
  """ๅŠ ่ฝฝ VoxCPM ๆจกๅž‹"""
46
  try:
47
  from voxcpm import VoxCPM
 
48
 
49
  device = "cuda" if torch.cuda.is_available() else "cpu"
50
  print(f"Loading VoxCPM model on {device}...")
 
55
  model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
56
 
57
  # CRITICAL FIX: Force float32 on CPU
 
 
 
 
58
  if device == "cpu":
59
  print("Converting model to float32 for CPU compatibility...")
60
  # Step 1: Change config dtype so _inference creates float32 tensors
 
176
 
177
  # ๅˆ›ๅปบ Gradio ็•Œ้ข
178
  with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
179
+ gr.Markdown("# ๐ŸŽญ NumberBlocks One Voice Cloning (VoxCPM V5)")
180
  gr.Markdown("### ไฝฟ็”จ VoxCPM 2 ๆจกๅž‹ๅ…‹้š† One ็š„ๅฃฐ้Ÿณ")
181
 
182
  with gr.Row():
 
233
  - **CFG Value**: ๆŽงๅˆถ้Ÿณ่‰ฒ็›ธไผผๅบฆ๏ผŒ้ป˜่ฎค 2.0๏ผŒ่ถŠ้ซ˜่ถŠๅƒๅ‚่€ƒ้Ÿณ่‰ฒ
234
  - **ๆŽจ็†ๆญฅๆ•ฐ**: ้ป˜่ฎค 10๏ผŒ่ถŠ้ซ˜่ดจ้‡่ถŠๅฅฝไฝ†็”Ÿๆˆ่ถŠๆ…ข
235
  - **ๆจกๅž‹**: VoxCPM 2 (openbmb/VoxCPM2)
236
+ - **V5**: CPU float32 + SDPA mask shape fix
237
  """)
238
 
239
if __name__ == "__main__":
    import threading

    def _warm_model():
        # Background warm-up: load the model while the UI is already up,
        # so the first user request doesn't pay the full load cost.
        print("Preloading VoxCPM model...")
        ensure_model()

    threading.Thread(target=_warm_model, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)