Commit ·
f012a29
1
Parent(s): b49a86d
Fix stereo conformance across all write paths
Browse files
- _save_wav: always applies _to_stereo() so every output WAV is 2-channel
- _resample_to_target: only treats (2,T) as stereo; (1,T) preserved as-is
- regen_taro/mmaudio/hunyuan_segment: apply _to_stereo() before _splice_and_save
- media_utils.py: remove pan=stereo|c0=c0|c1=c0 filter that was routing only
left channel to both outputs once input became true stereo
HunyuanVideo-Foley/hunyuanvideo_foley/utils/media_utils.py
CHANGED
|
@@ -55,6 +55,9 @@ def merge_audio_video(
|
|
| 55 |
}
|
| 56 |
|
| 57 |
# Build ffmpeg command
|
|
|
|
|
|
|
|
|
|
| 58 |
ffmpeg_command = [
|
| 59 |
"ffmpeg",
|
| 60 |
"-i", video_path,
|
|
@@ -62,7 +65,6 @@ def merge_audio_video(
|
|
| 62 |
"-c:v", "copy",
|
| 63 |
"-c:a", "aac",
|
| 64 |
"-ac", "2",
|
| 65 |
-
"-af", "pan=stereo|c0=c0|c1=c0",
|
| 66 |
"-map", "0:v:0",
|
| 67 |
"-map", "1:a:0",
|
| 68 |
*quality_settings.get(quality, quality_settings["high"]),
|
|
|
|
| 55 |
}
|
| 56 |
|
| 57 |
# Build ffmpeg command
|
| 58 |
+
# Note: audio input is already stereo (2,T); no channel remapping needed.
|
| 59 |
+
# The old `pan=stereo|c0=c0|c1=c0` filter duplicated only the left channel,
|
| 60 |
+
# so the right channel's content was discarded when the source was true stereo.
|
| 61 |
ffmpeg_command = [
|
| 62 |
"ffmpeg",
|
| 63 |
"-i", video_path,
|
|
|
|
| 65 |
"-c:v", "copy",
|
| 66 |
"-c:a", "aac",
|
| 67 |
"-ac", "2",
|
|
|
|
| 68 |
"-map", "0:v:0",
|
| 69 |
"-map", "1:a:0",
|
| 70 |
*quality_settings.get(quality, quality_settings["high"]),
|
app.py
CHANGED
|
@@ -719,18 +719,26 @@ def _resample_to_target(wav: np.ndarray, src_sr: int,
|
|
| 719 |
|
| 720 |
*dst_sr* defaults to TARGET_SR (48 kHz). No-op if src_sr == dst_sr.
|
| 721 |
Uses torchaudio Kaiser-windowed sinc resampling — CPU-only, ZeroGPU-safe.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
"""
|
| 723 |
if dst_sr is None:
|
| 724 |
dst_sr = TARGET_SR
|
| 725 |
if src_sr == dst_sr:
|
| 726 |
return wav
|
| 727 |
-
stereo
|
|
|
|
| 728 |
t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
|
| 729 |
if not stereo:
|
| 730 |
-
|
|
|
|
|
|
|
| 731 |
t = torchaudio.functional.resample(t, src_sr, dst_sr)
|
| 732 |
-
if not stereo:
|
| 733 |
-
t = t.squeeze(0) #
|
| 734 |
return t.numpy()
|
| 735 |
|
| 736 |
|
|
@@ -807,10 +815,10 @@ def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
|
|
| 807 |
|
| 808 |
|
| 809 |
def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
|
| 810 |
-
"""Save a numpy wav array
|
|
|
|
|
|
|
| 811 |
t = torch.from_numpy(np.ascontiguousarray(wav))
|
| 812 |
-
if t.ndim == 1:
|
| 813 |
-
t = t.unsqueeze(0)
|
| 814 |
torchaudio.save(path, t, sr)
|
| 815 |
|
| 816 |
|
|
@@ -1523,6 +1531,8 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1523 |
|
| 1524 |
# Upsample 16kHz → 48kHz (sinc, CPU)
|
| 1525 |
new_wav = _upsample_taro(new_wav)
|
|
|
|
|
|
|
| 1526 |
# CPU: splice, stitch, mux, save
|
| 1527 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1528 |
new_wav, seg_idx, meta, slot_id
|
|
@@ -1606,6 +1616,8 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1606 |
sr = TARGET_SR
|
| 1607 |
meta["sr"] = sr
|
| 1608 |
|
|
|
|
|
|
|
| 1609 |
# CPU: splice, stitch, mux, save
|
| 1610 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1611 |
new_wav, seg_idx, meta, slot_id
|
|
@@ -1684,6 +1696,8 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1684 |
|
| 1685 |
meta["sr"] = sr
|
| 1686 |
|
|
|
|
|
|
|
| 1687 |
# CPU: splice, stitch, mux, save
|
| 1688 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1689 |
new_wav, seg_idx, meta, slot_id
|
|
|
|
| 719 |
|
| 720 |
*dst_sr* defaults to TARGET_SR (48 kHz). No-op if src_sr == dst_sr.
|
| 721 |
Uses torchaudio Kaiser-windowed sinc resampling — CPU-only, ZeroGPU-safe.
|
| 722 |
+
|
| 723 |
+
Shape contract: returns same shape as input.
|
| 724 |
+
(T,) → (T,) mono
|
| 725 |
+
(2, T) → (2, T) stereo
|
| 726 |
+
(1, T) → (1, T) 1-channel (caller must apply _to_stereo if needed)
|
| 727 |
"""
|
| 728 |
if dst_sr is None:
|
| 729 |
dst_sr = TARGET_SR
|
| 730 |
if src_sr == dst_sr:
|
| 731 |
return wav
|
| 732 |
+
# Treat as stereo only when there are genuinely 2 channels
|
| 733 |
+
stereo = wav.ndim == 2 and wav.shape[0] == 2
|
| 734 |
t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
|
| 735 |
if not stereo:
|
| 736 |
+
if t.ndim == 1:
|
| 737 |
+
t = t.unsqueeze(0) # (T,) → (1, T) for resample
|
| 738 |
+
# (1, T) or any other 2D single-channel stays as-is
|
| 739 |
t = torchaudio.functional.resample(t, src_sr, dst_sr)
|
| 740 |
+
if not stereo and wav.ndim == 1:
|
| 741 |
+
t = t.squeeze(0) # (1, T) → (T,) to match input shape
|
| 742 |
return t.numpy()
|
| 743 |
|
| 744 |
|
|
|
|
| 815 |
|
| 816 |
|
| 817 |
def _save_wav(path: str, wav: np.ndarray, sr: int) -> None:
|
| 818 |
+
"""Save a numpy wav array to *path* as stereo (2, T) via torchaudio.
|
| 819 |
+
Always conforms to stereo so every output WAV is a proper 2-channel file."""
|
| 820 |
+
wav = _to_stereo(wav) # (1,T) or (T,) → (2,T); (2,T) is a no-op
|
| 821 |
t = torch.from_numpy(np.ascontiguousarray(wav))
|
|
|
|
|
|
|
| 822 |
torchaudio.save(path, t, sr)
|
| 823 |
|
| 824 |
|
|
|
|
| 1531 |
|
| 1532 |
# Upsample 16kHz → 48kHz (sinc, CPU)
|
| 1533 |
new_wav = _upsample_taro(new_wav)
|
| 1534 |
+
# Conform to stereo (2, T) — TARO returns mono (T,)
|
| 1535 |
+
new_wav = _to_stereo(new_wav)
|
| 1536 |
# CPU: splice, stitch, mux, save
|
| 1537 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1538 |
new_wav, seg_idx, meta, slot_id
|
|
|
|
| 1616 |
sr = TARGET_SR
|
| 1617 |
meta["sr"] = sr
|
| 1618 |
|
| 1619 |
+
# Conform to stereo (2, T) — MMAudio returns (2,T) but apply defensively
|
| 1620 |
+
new_wav = _to_stereo(new_wav)
|
| 1621 |
# CPU: splice, stitch, mux, save
|
| 1622 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1623 |
new_wav, seg_idx, meta, slot_id
|
|
|
|
| 1696 |
|
| 1697 |
meta["sr"] = sr
|
| 1698 |
|
| 1699 |
+
# Conform to stereo (2, T) — HunyuanFoley DAC returns (1, T)
|
| 1700 |
+
new_wav = _to_stereo(new_wav)
|
| 1701 |
# CPU: splice, stitch, mux, save
|
| 1702 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1703 |
new_wav, seg_idx, meta, slot_id
|