Commit f4857cb

tests
1 parent 7f29c66 commit f4857cb

2 files changed: +4 −128 lines changed

tests/torchtune/training/checkpointing/test_checkpointer.py (−124 lines)
@@ -754,68 +754,6 @@ def test_load_save_checkpoint_single_file(
 
         assert len(output_state_dict.keys()) == len(orig_state_dict.keys()) - 1
 
-    '''
-    def test_load_save_checkpoint_single_file_with_dcp(
-        self,
-        single_file_checkpointer: FullModelHFCheckpointer,
-        mistral_reward_model_hf_checkpoint: Path,
-    ):
-        """
-        Test ``load_checkpoint`` and ``save_checkpoint`` method within the
-        FullModelHFCheckpointer for a single checkpoint file for a mistral reward model
-        with DCP.
-
-        We test:
-        * ``load_checkpoint`` loads the right sets of keys
-        * Internal state of the checkpointer is correctly updated
-        * Converted checkpoint can be loaded into the `mistral_classifier` torchtune implementation
-        * Saved checkpoint keys match the original checkpoint
-        """
-        single_file_checkpointer._enable_dcp = True
-        # Read the state dict directly from file using torch.load. This will be the state
-        # dict we test against
-        checkpoint_file = mistral_reward_model_hf_checkpoint
-        orig_state_dict = safe_torch_load(checkpoint_file)
-
-        # Converted state dict from the checkpointer
-        state_dict = single_file_checkpointer.load_checkpoint()
-        # Check that we've loaded all the keys minus the output bias
-        assert len(state_dict["model"].keys()) == len(orig_state_dict.keys()) - 1
-
-        # the keys in original state dict should match up with the keys in the weight_map
-        for key in orig_state_dict.keys():
-            if "inv_freq" in key or "output.bias" in key:
-                continue
-            assert key in single_file_checkpointer._weight_map
-
-        # loading the state dict into the model implementation should work correctly
-        model = mistral.mistral_classifier(
-            num_classes=1,
-            vocab_size=_VOCAB_SIZE,
-            num_layers=1,
-            num_heads=_NUM_HEADS,
-            num_kv_heads=_NUM_KV_HEADS,
-            embed_dim=_DIM,
-            intermediate_dim=_HIDDEN_DIM,
-            max_seq_len=128,
-        )
-        model.load_state_dict(state_dict["model"])
-
-        single_file_checkpointer.save_checkpoint(state_dict, epoch=1)
-
-        # Reload the output checkpoint file and compare to the original checkpoint. This
-        # assumes we know what the name of the file is. This is fine, breaking this logic
-        # should be something we capture through this test
-        output_file = Path.joinpath(
-            checkpoint_file.parent.parent / "output_dir",
-            "epoch_1",
-            SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)),
-        ).with_suffix(".safetensors")
-        output_state_dict = safe_torch_load(output_file)
-
-        assert len(output_state_dict.keys()) == len(orig_state_dict.keys()) - 1
-    '''
-
 
 class TestHFGemmaFullModelCheckpointer:
     @pytest.fixture
@@ -983,65 +921,3 @@ def test_load_save_checkpoint_single_file(
         output_state_dict = safe_torch_load(output_file)
 
         assert len(output_state_dict.keys()) == len(orig_state_dict.keys())
-
-    '''
-    def test_load_save_checkpoint_single_file_with_dcp(
-        self,
-        single_file_checkpointer: FullModelHFCheckpointer,
-        gemma_hf_checkpoint: Path,
-    ):
-        """
-        Test ``load_checkpoint`` and ``save_checkpoint`` method within the
-        FullModelHFCheckpointer for a single checkpoint file for Gemma with DCP enabled.
-
-        We test:
-        * ``load_checkpoint`` loads the right sets of keys
-        * Internal state of the checkpointer is correctly updated
-        * Converted checkpoint can be loaded into the `gemma` TorchTune implementation
-        * lm_head weights are tied to the embed_tokens weights during saving
-        * lmhead weights are popped during loading
-        """
-        single_file_checkpointer._enable_dcp = True
-        # Read the state dict directly from file using torch.load. This will be the state
-        # dict we test against
-        checkpoint_file = gemma_hf_checkpoint
-        orig_state_dict = safe_torch_load(checkpoint_file)
-
-        # Converted state dict from the checkpointer
-
-        state_dict = single_file_checkpointer.load_checkpoint()
-        assert len(state_dict["model"].keys()) == len(orig_state_dict.keys())
-
-        # the keys in original state dict should match up with the keys in the weight_map
-        for key in orig_state_dict.keys():
-            if "inv_freq" in key:
-                continue
-            assert key in single_file_checkpointer._weight_map
-
-        # loading the state dict into the model implementation should work correctly
-        model = gemma.gemma(
-            vocab_size=_VOCAB_SIZE,
-            num_layers=1,
-            num_heads=_NUM_HEADS,
-            head_dim=_HEAD_DIM,
-            num_kv_heads=1,
-            embed_dim=_DIM,
-            intermediate_dim=_HIDDEN_DIM,
-            max_seq_len=128,
-        )
-        model.load_state_dict(state_dict["model"])
-
-        single_file_checkpointer.save_checkpoint(state_dict, epoch=1)
-
-        # Reload the output checkpoint file and compare to the original checkpoint. This
-        # assumes we know what the name of the file is. This is fine, breaking this logic
-        # should be something we capture through this test
-        output_file = Path.joinpath(
-            checkpoint_file.parent.parent / "output_dir",
-            "epoch_1",
-            SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)),
-        ).with_suffix(".safetensors")
-        output_state_dict = safe_torch_load(output_file)
-
-        assert len(output_state_dict.keys()) == len(orig_state_dict.keys())
-    '''
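
Both removed tests computed the expected output shard filename by hand before comparing state dicts. For reference, here is a minimal sketch of that filename logic; the SHARD_FNAME template shown is an assumption modeled on the Hugging Face sharding convention, so check torchtune's checkpointing constants for the real value.

```python
# Minimal sketch of the removed tests' output-path logic.
# SHARD_FNAME here is an assumed template (Hugging Face style);
# torchtune defines the real constant.
from pathlib import Path

SHARD_FNAME = "model-{cpt_idx}-of-{num_shards}"  # assumption, not verified

shard_name = SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5))
output_file = Path("output_dir", "epoch_1", shard_name).with_suffix(".safetensors")
print(output_file)  # output_dir/epoch_1/model-00001-of-00001.safetensors
```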

torchtune/training/checkpointing/_checkpointer.py (+4 −4 lines)
@@ -442,7 +442,7 @@ def __init__(
                 f"Got {self._fs} and {output_fs} instead."
             )
 
-        self._fs.mkdir(output_dir, exist_ok=True)
+        self._fs.mkdirs(output_dir, exist_ok=True)
 
         # weight_map contains the state_dict key -> checkpoint file mapping so we can correctly
         # parition the state dict into output checkpoint files. This is updated during checkpoint
@@ -813,7 +813,7 @@ def save_checkpoint(
                 output_path = os.path.join(
                     self._output_dir, f"epoch_{epoch}", shard_name
                 )
-                self._fs.mkdir(os.path.dirname(output_path), exist_ok=True)
+                self._fs.mkdirs(os.path.dirname(output_path), exist_ok=True)
                 if not self._safe_serialization:
                     output_path = output_path + ".bin"
                     torch.save(model_state_dict, output_path)
@@ -865,7 +865,7 @@ def save_checkpoint(
                 os.path.join(self._output_dir, f"epoch_{epoch}", ADAPTER_MODEL_FNAME)
                 + ".pt"
             )
-            self._fs.mkdir(os.path.dirname(output_path), exist_ok=True)
+            self._fs.mkdirs(os.path.dirname(output_path), exist_ok=True)
             torch.save(state_dict[training.ADAPTER_KEY], output_path)
             logger.info(
                 "Adapter checkpoint of size "
@@ -894,7 +894,7 @@ def save_checkpoint(
             output_path = os.path.join(
                 self._output_dir, f"epoch_{epoch}", ADAPTER_MODEL_FNAME
             )
-            self._fs.mkdir(os.path.dirname(output_path), exist_ok=True)
+            self._fs.mkdirs(os.path.dirname(output_path), exist_ok=True)
            if not self._safe_serialization:
                 output_path = output_path + ".bin"
                 torch.save(state_dict[training.ADAPTER_KEY], output_path)
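
All four functional changes in this file swap self._fs.mkdir for self._fs.mkdirs. A minimal sketch of the difference, assuming self._fs is an fsspec AbstractFileSystem: mkdirs is an alias for makedirs, whose signature accepts exist_ok, so re-creating an existing epoch directory is a no-op; mkdir's signature is mkdir(path, create_parents=True, **kwargs), and on some backends (such as the local filesystem) it raises FileExistsError when the directory already exists.

```python
# Minimal sketch, assuming self._fs is an fsspec filesystem (the local
# backend is used here purely for illustration).
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file")
out_dir = os.path.join(tempfile.mkdtemp(), "epoch_1")

fs.mkdirs(out_dir, exist_ok=True)  # creates missing parents as needed
fs.mkdirs(out_dir, exist_ok=True)  # idempotent: no error on repeat calls

# fs.mkdir(out_dir, exist_ok=True) would forward exist_ok into **kwargs;
# on the local backend mkdir raises FileExistsError for an existing path,
# which is why repeated saves into the same epoch directory need mkdirs.
```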

Comments (0)