BoxOfColors committed on
Commit
f012a29
·
1 Parent(s): b49a86d

Fix stereo conformance across all write paths

Browse files

- _save_wav: always applies _to_stereo() so every output WAV is 2-channel
- _resample_to_target: only treats (2,T) as stereo; (1,T) preserved as-is
- regen_taro/mmaudio/hunyuan_segment: apply _to_stereo() before _splice_and_save
- media_utils.py: remove pan=stereo|c0=c0|c1=c0 filter that was routing only
left channel to both outputs once input became true stereo

HunyuanVideo-Foley/hunyuanvideo_foley/utils/media_utils.py CHANGED
@@ -55,6 +55,9 @@ def merge_audio_video(
55
  }
56
 
57
  # Build ffmpeg command
 
 
 
58
  ffmpeg_command = [
59
  "ffmpeg",
60
  "-i", video_path,
@@ -62,7 +65,6 @@ def merge_audio_video(
62
  "-c:v", "copy",
63
  "-c:a", "aac",
64
  "-ac", "2",
65
- "-af", "pan=stereo|c0=c0|c1=c0",
66
  "-map", "0:v:0",
67
  "-map", "1:a:0",
68
  *quality_settings.get(quality, quality_settings["high"]),
 
55
  }
56
 
57
  # Build ffmpeg command
58
+ # Note: audio input is already stereo (2,T); no channel remapping needed.
59
+ # The old `pan=stereo|c0=c0|c1=c0` filter duplicated only the left channel,
60
+ # which discarded the right channel's content (dual-mono output) when the source was true stereo.
61
  ffmpeg_command = [
62
  "ffmpeg",
63
  "-i", video_path,
 
65
  "-c:v", "copy",
66
  "-c:a", "aac",
67
  "-ac", "2",
 
68
  "-map", "0:v:0",
69
  "-map", "1:a:0",
70
  *quality_settings.get(quality, quality_settings["high"]),
app.py CHANGED
@@ -719,18 +719,26 @@ def _resample_to_target(wav: np.ndarray, src_sr: int,
719
 
720
  *dst_sr* defaults to TARGET_SR (48 kHz). No-op if src_sr == dst_sr.
721
  Uses torchaudio Kaiser-windowed sinc resampling — CPU-only, ZeroGPU-safe.
 
 
 
 
 
722
  """
723
  if dst_sr is None:
724
  dst_sr = TARGET_SR
725
  if src_sr == dst_sr:
726
  return wav
727
- stereo = wav.ndim == 2
 
728
  t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
729
  if not stereo:
730
- t = t.unsqueeze(0) # [1, T]
 
 
731
  t = torchaudio.functional.resample(t, src_sr, dst_sr)
732
- if not stereo:
733
- t = t.squeeze(0) # [T]
734
  return t.numpy()
735
 
736
 
@@ -807,10 +815,10 @@ def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
807
 
808
 
809
  def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
810
- """Save a numpy wav array (mono or stereo) to *path* via torchaudio."""
 
 
811
  t = torch.from_numpy(np.ascontiguousarray(wav))
812
- if t.ndim == 1:
813
- t = t.unsqueeze(0)
814
  torchaudio.save(path, t, sr)
815
 
816
 
@@ -1523,6 +1531,8 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
1523
 
1524
  # Upsample 16kHz → 48kHz (sinc, CPU)
1525
  new_wav = _upsample_taro(new_wav)
 
 
1526
  # CPU: splice, stitch, mux, save
1527
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1528
  new_wav, seg_idx, meta, slot_id
@@ -1606,6 +1616,8 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1606
  sr = TARGET_SR
1607
  meta["sr"] = sr
1608
 
 
 
1609
  # CPU: splice, stitch, mux, save
1610
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1611
  new_wav, seg_idx, meta, slot_id
@@ -1684,6 +1696,8 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1684
 
1685
  meta["sr"] = sr
1686
 
 
 
1687
  # CPU: splice, stitch, mux, save
1688
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1689
  new_wav, seg_idx, meta, slot_id
 
719
 
720
  *dst_sr* defaults to TARGET_SR (48 kHz). No-op if src_sr == dst_sr.
721
  Uses torchaudio Kaiser-windowed sinc resampling — CPU-only, ZeroGPU-safe.
722
+
723
+ Shape contract: returns same shape as input.
724
+ (T,) → (T,) mono
725
+ (2, T) → (2, T) stereo
726
+ (1, T) → (1, T) 1-channel (caller must apply _to_stereo if needed)
727
  """
728
  if dst_sr is None:
729
  dst_sr = TARGET_SR
730
  if src_sr == dst_sr:
731
  return wav
732
+ # Treat as stereo only when there are genuinely 2 channels
733
+ stereo = wav.ndim == 2 and wav.shape[0] == 2
734
  t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
735
  if not stereo:
736
+ if t.ndim == 1:
737
+ t = t.unsqueeze(0) # (T,) → (1, T) for resample
738
+ # (1, T) or any other 2D single-channel stays as-is
739
  t = torchaudio.functional.resample(t, src_sr, dst_sr)
740
+ if not stereo and wav.ndim == 1:
741
+ t = t.squeeze(0) # (1, T) → (T,) to match input shape
742
  return t.numpy()
743
 
744
 
 
815
 
816
 
817
  def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
818
+ """Save a numpy wav array to *path* as stereo (2, T) via torchaudio.
819
+ Always conforms to stereo so every output WAV is a proper 2-channel file."""
820
+ wav = _to_stereo(wav) # (1,T) or (T,) → (2,T); (2,T) is a no-op
821
  t = torch.from_numpy(np.ascontiguousarray(wav))
 
 
822
  torchaudio.save(path, t, sr)
823
 
824
 
 
1531
 
1532
  # Upsample 16kHz → 48kHz (sinc, CPU)
1533
  new_wav = _upsample_taro(new_wav)
1534
+ # Conform to stereo (2, T) — TARO returns mono (T,)
1535
+ new_wav = _to_stereo(new_wav)
1536
  # CPU: splice, stitch, mux, save
1537
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1538
  new_wav, seg_idx, meta, slot_id
 
1616
  sr = TARGET_SR
1617
  meta["sr"] = sr
1618
 
1619
+ # Conform to stereo (2, T) — MMAudio returns (2,T) but apply defensively
1620
+ new_wav = _to_stereo(new_wav)
1621
  # CPU: splice, stitch, mux, save
1622
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1623
  new_wav, seg_idx, meta, slot_id
 
1696
 
1697
  meta["sr"] = sr
1698
 
1699
+ # Conform to stereo (2, T) — HunyuanFoley DAC returns (1, T)
1700
+ new_wav = _to_stereo(new_wav)
1701
  # CPU: splice, stitch, mux, save
1702
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1703
  new_wav, seg_idx, meta, slot_id