ayf3 commited on
Commit
44ebeaa
·
verified ·
1 Parent(s): d365d5c

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +32 -12
app.py CHANGED
@@ -1,7 +1,12 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloning Space - VoxCPM V4
4
- Fix: Force float32 on CPU to avoid bfloat16 dimension errors in MiniCPM4 attention
 
 
 
 
 
5
  """
6
 
7
  import os
@@ -10,15 +15,36 @@ import tempfile
10
  import soundfile as sf
11
  import traceback
12
  from pathlib import Path
 
 
13
 
14
- # ็Žฏๅขƒๅ˜้‡ๆฃ€ๆŸฅ
15
  HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def load_model():
18
  """ๅŠ ่ฝฝ VoxCPM ๆจกๅž‹"""
19
  try:
20
  from voxcpm import VoxCPM
21
- import torch
22
 
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"Loading VoxCPM model on {device}...")
@@ -29,10 +55,6 @@ def load_model():
29
  model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
30
 
31
  # CRITICAL FIX: Force float32 on CPU
32
- # VoxCPM2 uses bfloat16 by default, which causes dimension/dtype errors on CPU:
33
- # 1. "Dimension out of range" in MiniCPM4's scaled_dot_product_attention (bfloat16 SDPA bug)
34
- # 2. "mat1 and mat2 must have the same dtype" when model is float32 but inputs are bfloat16
35
- # Fix: change config.dtype BEFORE anything creates tensors, then convert model
36
  if device == "cpu":
37
  print("Converting model to float32 for CPU compatibility...")
38
  # Step 1: Change config dtype so _inference creates float32 tensors
@@ -154,7 +176,7 @@ PRESET_TEXTS = {
154
 
155
  # ๅˆ›ๅปบ Gradio ็•Œ้ข
156
  with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
157
- gr.Markdown("# ๐ŸŽญ NumberBlocks One Voice Cloning (VoxCPM V4)")
158
  gr.Markdown("### ไฝฟ็”จ VoxCPM 2 ๆจกๅž‹ๅ…‹้š† One ็š„ๅฃฐ้Ÿณ")
159
 
160
  with gr.Row():
@@ -211,16 +233,14 @@ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
211
  - **CFG Value**: ๆŽงๅˆถ้Ÿณ่‰ฒ็›ธไผผๅบฆ๏ผŒ้ป˜่ฎค 2.0๏ผŒ่ถŠ้ซ˜่ถŠๅƒๅ‚่€ƒ้Ÿณ่‰ฒ
212
  - **ๆŽจ็†ๆญฅๆ•ฐ**: ้ป˜่ฎค 10๏ผŒ่ถŠ้ซ˜่ดจ้‡่ถŠๅฅฝไฝ†็”Ÿๆˆ่ถŠๆ…ข
213
  - **ๆจกๅž‹**: VoxCPM 2 (openbmb/VoxCPM2)
214
- - **V4 ไฟฎๅค**: CPU ไธŠไฝฟ็”จ float32 ้ฟๅ… bfloat16 ็ปดๅบฆ้”™่ฏฏ
215
  """)
216
 
217
  if __name__ == "__main__":
218
- # ๅฏๅŠจๆ—ถ้ข„ๅŠ ่ฝฝๆจกๅž‹
219
  import threading
220
  def preload():
221
  print("Preloading VoxCPM model...")
222
  ensure_model()
223
 
224
  threading.Thread(target=preload, daemon=True).start()
225
-
226
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloning Space - VoxCPM V5
4
+ Fix: float32 on CPU + monkey-patch SDPA mask shape for CPU compatibility
5
+
6
+ Root cause of "Dimension out of range":
7
+ MiniCPM4's Attention.forward_step creates a 1D attn_mask but SDPA on CPU
8
+ expects at least 2D for proper broadcasting with GQA (Grouped Query Attention).
9
+ On GPU, the flash-attention backend handles this; on CPU the math backend does not.
10
  """
11
 
12
  import os
 
15
  import soundfile as sf
16
  import traceback
17
  from pathlib import Path
18
+ import torch
19
+ import torch.nn.functional as F
20
 
 
21
  HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
22
 
23
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
24
+ # Monkey-patch: fix SDPA mask shape for CPU
25
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
26
+ _original_sdpa = F.scaled_dot_product_attention
27
+
28
+ def _cpu_safe_sdpa(query, key, value, attn_mask=None, **kwargs):
29
+ """Wrapper that fixes 1D attn_mask for CPU SDPA."""
30
+ if attn_mask is not None and attn_mask.dim() == 1 and not torch.cuda.is_available():
31
+ # attn_mask is (seq_len,) but SDPA needs (B, H, L, S) or broadcastable
32
+ # query shape: (B, H, L, D), key shape: (B, H_kv, S, D)
33
+ B, H, L, D = query.shape
34
+ S = key.shape[2]
35
+ # Reshape 1D mask to (1, 1, 1, S) for proper broadcasting
36
+ attn_mask = attn_mask.view(1, 1, 1, S).expand(B, H, L, S)
37
+ return _original_sdpa(query, key, value, attn_mask=attn_mask, **kwargs)
38
+
39
+ # Apply the patch globally
40
+ F.scaled_dot_product_attention = _cpu_safe_sdpa
41
+ print("โœ… Patched scaled_dot_product_attention for CPU mask shape fix")
42
+
43
+
44
  def load_model():
45
  """ๅŠ ่ฝฝ VoxCPM ๆจกๅž‹"""
46
  try:
47
  from voxcpm import VoxCPM
 
48
 
49
  device = "cuda" if torch.cuda.is_available() else "cpu"
50
  print(f"Loading VoxCPM model on {device}...")
 
55
  model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
56
 
57
  # CRITICAL FIX: Force float32 on CPU
 
 
 
 
58
  if device == "cpu":
59
  print("Converting model to float32 for CPU compatibility...")
60
  # Step 1: Change config dtype so _inference creates float32 tensors
 
176
 
177
  # ๅˆ›ๅปบ Gradio ็•Œ้ข
178
  with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
179
+ gr.Markdown("# ๐ŸŽญ NumberBlocks One Voice Cloning (VoxCPM V5)")
180
  gr.Markdown("### ไฝฟ็”จ VoxCPM 2 ๆจกๅž‹ๅ…‹้š† One ็š„ๅฃฐ้Ÿณ")
181
 
182
  with gr.Row():
 
233
  - **CFG Value**: ๆŽงๅˆถ้Ÿณ่‰ฒ็›ธไผผๅบฆ๏ผŒ้ป˜่ฎค 2.0๏ผŒ่ถŠ้ซ˜่ถŠๅƒๅ‚่€ƒ้Ÿณ่‰ฒ
234
  - **ๆŽจ็†ๆญฅๆ•ฐ**: ้ป˜่ฎค 10๏ผŒ่ถŠ้ซ˜่ดจ้‡่ถŠๅฅฝไฝ†็”Ÿๆˆ่ถŠๆ…ข
235
  - **ๆจกๅž‹**: VoxCPM 2 (openbmb/VoxCPM2)
236
+ - **V5**: CPU float32 + SDPA mask shape fix
237
  """)
238
 
239
if __name__ == "__main__":
    import threading

    def _warm_model():
        # Background warm-up: load the model while the UI is already up,
        # so the first user request doesn't pay the full load cost.
        print("Preloading VoxCPM model...")
        ensure_model()

    threading.Thread(target=_warm_model, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)