diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py index 791dec073524..1d33c2ae188f 100644 --- a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py +++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py @@ -611,7 +611,7 @@ def prepare_cond_latents_and_mask( tuple: (cond_latents_concat, mask_concat) - both are zero tensors for t2v """ - batch, channels, frames, height, width = latents.shape + batch, channels, frames, latent_height, latent_width = latents.shape image_latents = self._get_image_latents( vae=self.vae, @@ -626,7 +626,7 @@ def prepare_cond_latents_and_mask( latent_condition[:, :, 1:, :, :] = 0 latent_condition = latent_condition.to(device=device, dtype=dtype) - latent_mask = torch.zeros(batch, 1, frames, height, width, dtype=dtype, device=device) + latent_mask = torch.zeros(batch, 1, frames, latent_height, latent_width, dtype=dtype, device=device) latent_mask[:, :, 0, :, :] = 1.0 return latent_condition, latent_mask