huggingface · zucchini-nlp · Feb 27, 2025 · Feb 27, 2025 · Feb 27, 2025 · Feb 28, 2025
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
@@ -904,8 +904,8 @@ def _load_state_dict_into_meta_model(
                         param = param[:].to(param_casting_dtype)
                     module.load_state_dict(
                         {param_type: param[:].to(param_device)},
-                        False,
-                        True,
+                        strict=False,
+                        assign=True,
                     )
                 else:
                     hf_quantizer.create_quantized_param(
@@ -4246,14 +4246,8 @@ def from_pretrained(
                 loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]
             else:
                 loaded_state_dict_keys = list(state_dict.keys())
-            if (
-                gguf_path is None
-                and (low_cpu_mem_usage or (use_keep_in_fp32_modules and is_accelerate_available()))
-                and pretrained_model_name_or_path is not None
-            ):
-                # In case some weights need to be kept in float32 and accelerate is not installed,
-                # we later on want to take the path where state_dict is not None, that is the one
-                # that do not require accelerate.
+
+            if gguf_path is None and low_cpu_mem_usage and pretrained_model_name_or_path is not None:
                 state_dict = None
 
         config.name_or_path = pretrained_model_name_or_path

diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
@@ -485,7 +485,7 @@ def test_model_from_config_torch_dtype_str(self):
         with self.assertRaises(ValueError):
             model = AutoModel.from_pretrained(TINY_T5, torch_dtype="int64")
 
-    def test_model_from_config_torch_dtype_composite(self):
+    def test_model_from_pretrained_torch_dtype_composite(self):
         """
         Test that from_pretrained works with torch_dtype being as a dict per each sub-config in composite config
         Tiny-Llava has saved auto dtype as `torch.float32` for all modules.
@@ -507,6 +507,16 @@ def test_model_from_config_torch_dtype_composite(self):
         self.assertEqual(model.vision_tower.dtype, torch.float16)
         self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.bfloat16)
 
+        # Test if we use the `accelerate` loading with `low_cpu_mem_usage`. Test only once with any `torch_dtype`.
+        model = LlavaForConditionalGeneration.from_pretrained(
+            TINY_LLAVA,
+            torch_dtype={"text_config": "float32", "vision_config": "float16", "": "bfloat16"},
+            low_cpu_mem_usage=True,
+        )
+        self.assertEqual(model.language_model.dtype, torch.float32)
+        self.assertEqual(model.vision_tower.dtype, torch.float16)
+        self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.bfloat16)
+
         # should be able to set the values as torch.dtype (not str)
         model = LlavaForConditionalGeneration.from_pretrained(
             TINY_LLAVA, torch_dtype={"text_config": torch.float32, "vision_config": torch.float16, "": torch.bfloat16}
@@ -525,13 +535,12 @@ def test_model_from_config_torch_dtype_composite(self):
         self.assertEqual(model.vision_tower.dtype, torch.bfloat16)
         self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.float16)
 
-        # TODO @ARTHURZUCKER FIX THIS
         # but if the model has `_keep_in_fp32_modules` then those modules should be in fp32 no matter what
-        # LlavaForConditionalGeneration._keep_in_fp32_modules = ["multi_modal_projector"]
-        # model = LlavaForConditionalGeneration.from_pretrained(TINY_LLAVA, config=config, torch_dtype="auto")
-        # self.assertEqual(model.language_model.dtype, torch.float32)
-        # self.assertEqual(model.vision_tower.dtype, torch.bfloat16)
-        # self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.float32)
+        LlavaForConditionalGeneration._keep_in_fp32_modules = ["multi_modal_projector"]
+        model = LlavaForConditionalGeneration.from_pretrained(TINY_LLAVA, config=config, torch_dtype="auto")
+        self.assertEqual(model.language_model.dtype, torch.float32)
+        self.assertEqual(model.vision_tower.dtype, torch.bfloat16)
+        self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.float32)
 
         # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type
         with self.assertRaises(ValueError):