Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Sleeping

BoxOfColors committed 19 days ago

Commit

f012a29

1 Parent(s): b49a86d

Fix stereo conformance across all write paths

- _save_wav: always applies _to_stereo() so every output WAV is 2-channel
- _resample_to_target: only treats (2,T) as stereo; (1,T) preserved as-is
- regen_taro/mmaudio/hunyuan_segment: apply _to_stereo() before _splice_and_save
- media_utils.py: remove the pan=stereo|c0=c0|c1=c0 filter, which copied only
the left channel to both outputs and so discarded the right channel once the
input became true stereo

Files changed (2) hide show

HunyuanVideo-Foley/hunyuanvideo_foley/utils/media_utils.py +3 -1
app.py +21 -7

HunyuanVideo-Foley/hunyuanvideo_foley/utils/media_utils.py CHANGED Viewed

@@ -55,6 +55,9 @@ def merge_audio_video(
     }
     # Build ffmpeg command
     ffmpeg_command = [
         "ffmpeg",
         "-i", video_path,
@@ -62,7 +65,6 @@ def merge_audio_video(
         "-c:v", "copy",
         "-c:a", "aac",
         "-ac", "2",
-        "-af", "pan=stereo|c0=c0|c1=c0",
         "-map", "0:v:0",
         "-map", "1:a:0",
         *quality_settings.get(quality, quality_settings["high"]),

     }
     # Build ffmpeg command
+    # Note: audio input is already stereo (2,T); no channel remapping needed.
+    # The old `pan=stereo|c0=c0|c1=c0` filter copied the left channel to both
+    # outputs, which discarded the right channel when the source was true stereo.
     ffmpeg_command = [
         "ffmpeg",
         "-i", video_path,
         "-c:v", "copy",
         "-c:a", "aac",
         "-ac", "2",
         "-map", "0:v:0",
         "-map", "1:a:0",
         *quality_settings.get(quality, quality_settings["high"]),

app.py CHANGED Viewed

@@ -719,18 +719,26 @@ def _resample_to_target(wav: np.ndarray, src_sr: int,
     *dst_sr* defaults to TARGET_SR (48 kHz).  No-op if src_sr == dst_sr.
     Uses torchaudio Kaiser-windowed sinc resampling — CPU-only, ZeroGPU-safe.
     """
     if dst_sr is None:
         dst_sr = TARGET_SR
     if src_sr == dst_sr:
         return wav
-    stereo = wav.ndim == 2
     t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
     if not stereo:
-        t = t.unsqueeze(0)          # [1, T]
     t = torchaudio.functional.resample(t, src_sr, dst_sr)
-    if not stereo:
-        t = t.squeeze(0)            # [T]
     return t.numpy()
@@ -807,10 +815,10 @@ def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
 def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
-    """Save a numpy wav array (mono or stereo) to *path* via torchaudio."""
     t = torch.from_numpy(np.ascontiguousarray(wav))
-    if t.ndim == 1:
-        t = t.unsqueeze(0)
     torchaudio.save(path, t, sr)
@@ -1523,6 +1531,8 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
     # Upsample 16kHz → 48kHz (sinc, CPU)
     new_wav = _upsample_taro(new_wav)
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
@@ -1606,6 +1616,8 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
         sr = TARGET_SR
     meta["sr"] = sr
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
@@ -1684,6 +1696,8 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
     meta["sr"] = sr
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id

     *dst_sr* defaults to TARGET_SR (48 kHz).  No-op if src_sr == dst_sr.
     Uses torchaudio Kaiser-windowed sinc resampling — CPU-only, ZeroGPU-safe.
+    Shape contract: returns same shape as input.
+      (T,)   → (T,)      mono
+      (2, T) → (2, T)    stereo
+      (1, T) → (1, T)    1-channel (caller must apply _to_stereo if needed)
     """
     if dst_sr is None:
         dst_sr = TARGET_SR
     if src_sr == dst_sr:
         return wav
+    # Treat as stereo only when there are genuinely 2 channels
+    stereo = wav.ndim == 2 and wav.shape[0] == 2
     t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
     if not stereo:
+        if t.ndim == 1:
+            t = t.unsqueeze(0)      # (T,) → (1, T) for resample
+        # (1, T) or any other 2D single-channel stays as-is
     t = torchaudio.functional.resample(t, src_sr, dst_sr)
+    if not stereo and wav.ndim == 1:
+        t = t.squeeze(0)            # (1, T) → (T,) to match input shape
     return t.numpy()
 def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
+    """Save a numpy wav array to *path* as stereo (2, T) via torchaudio.
+    Always conforms to stereo so every output WAV is a proper 2-channel file."""
+    wav = _to_stereo(wav)   # (1,T) or (T,) → (2,T); (2,T) is a no-op
     t = torch.from_numpy(np.ascontiguousarray(wav))
     torchaudio.save(path, t, sr)
     # Upsample 16kHz → 48kHz (sinc, CPU)
     new_wav = _upsample_taro(new_wav)
+    # Conform to stereo (2, T) — TARO returns mono (T,)
+    new_wav = _to_stereo(new_wav)
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
         sr = TARGET_SR
     meta["sr"] = sr
+    # Conform to stereo (2, T) — MMAudio returns (2,T) but apply defensively
+    new_wav = _to_stereo(new_wav)
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
     meta["sr"] = sr
+    # Conform to stereo (2, T) — HunyuanFoley DAC returns (1, T)
+    new_wav = _to_stereo(new_wav)
     # CPU: splice, stitch, mux, save
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id