Spaces:

dagloop5
/

Testing2

Running on Zero

App Files Community

dagloop5 committed on 5 days ago

Commit

c43c959

verified ·

1 Parent(s): 2470e81

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -56

app.py CHANGED Viewed

@@ -65,7 +65,6 @@ from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTIL
 from ltx_pipelines.utils.helpers import (
     cleanup_memory,
     combined_image_conditionings,
-    denoise_audio_video,
     denoise_video_only,
     encode_prompts,
     simple_denoising_func,
@@ -103,7 +102,7 @@ RESOLUTIONS = {
 class LTX23DistilledA2VPipeline(DistilledPipeline):
-    """DistilledPipeline: single stage, full resolution, 8 steps, with optional audio."""
     def __call__(
         self,
@@ -118,7 +117,20 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
         print(prompt)
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
@@ -133,41 +145,32 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
         )
         video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
-        # Audio encoding — only runs if audio is provided
-        encoded_audio_latent = None
-        original_audio = None
-        if audio_path is not None:
-            video_duration = num_frames / frame_rate
-            decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
-            if decoded_audio is None:
-                raise ValueError(f"Could not extract audio stream from {audio_path}")
-            encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
-            audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
-            expected_frames = audio_shape.frames
-            actual_frames = encoded_audio_latent.shape[2]
-            if actual_frames > expected_frames:
-                encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
-            elif actual_frames < expected_frames:
-                pad = torch.zeros(
-                    encoded_audio_latent.shape[0],
-                    encoded_audio_latent.shape[1],
-                    expected_frames - actual_frames,
-                    encoded_audio_latent.shape[3],
-                    device=encoded_audio_latent.device,
-                    dtype=encoded_audio_latent.dtype,
-                )
-                encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
-            original_audio = Audio(
-                waveform=decoded_audio.waveform.squeeze(0),
-                sampling_rate=decoded_audio.sampling_rate,
             )
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
-        sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
         def denoising_loop(sigmas, video_state, audio_state, stepper):
             return euler_denoising_loop(
@@ -182,26 +185,26 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
                 ),
             )
-        output_shape = VideoPixelShape(
             batch=1,
             frames=num_frames,
-            width=width,
-            height=height,
             fps=frame_rate,
         )
-        conditionings = combined_image_conditionings(
             images=images,
-            height=output_shape.height,
-            width=output_shape.width,
             video_encoder=video_encoder,
             dtype=dtype,
             device=self.device,
         )
-        video_state, audio_state = denoise_audio_video(
-            output_shape=output_shape,
-            conditionings=conditionings,
             noiser=noiser,
-            sigmas=sigmas,
             stepper=stepper,
             denoising_loop_fn=denoising_loop,
             components=self.pipeline_components,
@@ -210,6 +213,39 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             initial_audio_latent=encoded_audio_latent,
         )
         torch.cuda.synchronize()
         del transformer
         del video_encoder
@@ -221,19 +257,11 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             tiling_config,
             generator,
         )
-        # If audio was provided as input, return it as-is (higher fidelity than decoded)
-        # If no audio input, decode the generated audio latent from the denoising
-        if original_audio is not None:
-            return decoded_video, original_audio
-        else:
-            from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
-            generated_audio = vae_decode_audio(
-                audio_state.latent,
-                self.model_ledger.audio_decoder(),
-                self.model_ledger.vocoder(),
-            )
-            return decoded_video, generated_audio
 # Model repos

 from ltx_pipelines.utils.helpers import (
     cleanup_memory,
     combined_image_conditionings,
     denoise_video_only,
     encode_prompts,
     simple_denoising_func,
 class LTX23DistilledA2VPipeline(DistilledPipeline):
+    """DistilledPipeline with optional audio conditioning."""
     def __call__(
         self,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
+        # Standard path when no audio input is provided.
         print(prompt)
+        if audio_path is None:
+            return super().__call__(
+                prompt=prompt,
+                seed=seed,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                frame_rate=frame_rate,
+                images=images,
+                tiling_config=tiling_config,
+                enhance_prompt=enhance_prompt,
+            )
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
         )
         video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        video_duration = num_frames / frame_rate
+        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
+        if decoded_audio is None:
+            raise ValueError(f"Could not extract audio stream from {audio_path}")
+        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
+        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
+        expected_frames = audio_shape.frames
+        actual_frames = encoded_audio_latent.shape[2]
+        if actual_frames > expected_frames:
+            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+        elif actual_frames < expected_frames:
+            pad = torch.zeros(
+                encoded_audio_latent.shape[0],
+                encoded_audio_latent.shape[1],
+                expected_frames - actual_frames,
+                encoded_audio_latent.shape[3],
+                device=encoded_audio_latent.device,
+                dtype=encoded_audio_latent.dtype,
             )
+            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
+        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
         def denoising_loop(sigmas, video_state, audio_state, stepper):
             return euler_denoising_loop(
                 ),
             )
+        stage_1_output_shape = VideoPixelShape(
             batch=1,
             frames=num_frames,
+            width=width // 2,
+            height=height // 2,
             fps=frame_rate,
         )
+        stage_1_conditionings = combined_image_conditionings(
             images=images,
+            height=stage_1_output_shape.height,
+            width=stage_1_output_shape.width,
             video_encoder=video_encoder,
             dtype=dtype,
             device=self.device,
         )
+        video_state = denoise_video_only(
+            output_shape=stage_1_output_shape,
+            conditionings=stage_1_conditionings,
             noiser=noiser,
+            sigmas=stage_1_sigmas,
             stepper=stepper,
             denoising_loop_fn=denoising_loop,
             components=self.pipeline_components,
             initial_audio_latent=encoded_audio_latent,
         )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],
+            video_encoder=video_encoder,
+            upsampler=self.model_ledger.spatial_upsampler(),
+        )
+        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
+        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
+        stage_2_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        video_state = denoise_video_only(
+            output_shape=stage_2_output_shape,
+            conditionings=stage_2_conditionings,
+            noiser=noiser,
+            sigmas=stage_2_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            noise_scale=stage_2_sigmas[0],
+            initial_video_latent=upscaled_video_latent,
+            initial_audio_latent=encoded_audio_latent,
+        )
         torch.cuda.synchronize()
         del transformer
         del video_encoder
             tiling_config,
             generator,
         )
+        original_audio = Audio(
+            waveform=decoded_audio.waveform.squeeze(0),
+            sampling_rate=decoded_audio.sampling_rate,
+        )
+        return decoded_video, original_audio
 # Model repos