diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
index d3c8a3539b98..18a0e970c610 100644
--- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
+++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
@@ -344,7 +344,7 @@ def _get_llama_prompt_embeds(
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
index 5802bde87a61..37a4f418cc6d 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -24,9 +24,11 @@
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig

 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +118,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -124,11 +126,21 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,8 +156,8 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )

         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")

         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
@@ -153,14 +165,14 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):

         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )

         components = {
@@ -190,6 +202,10 @@ def get_dummy_inputs(self, device, seed=0):
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
@@ -197,7 +213,7 @@ def get_dummy_inputs(self, device, seed=0):
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs