Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions skyrl/backends/skyrl_train/distributed/fsdp_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,31 +169,37 @@ def optimizer_step(
**kwargs,
) -> Optional[Float[torch.Tensor, "1"]]:
"""Perform optimizer step"""
import time as _time
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The time module is imported inside the optimizer_step method. According to Python style guides (like PEP 8), imports should be placed at the top of the file. This improves readability and avoids potential overhead if the method is called frequently in a hot loop.

Please move this import to the top of the file.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1


rank = dist.get_rank() if dist.is_initialized() else -1
grad_norm = None
if isinstance(model, HFModelWrapper):
model = model.model

if self.max_norm > 0:
t0 = _time.time()
# NOTE (sumanthrh): All `grad_norm`s returned here are the original grad norms before clipping.
if isinstance(model, FSDP):
grad_norm = model.clip_grad_norm_(max_norm=self.max_norm)
elif isinstance(model, FSDPModule):
grad_norm = fsdp2_clip_grad_norm_(model.parameters(), max_norm=self.max_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.max_norm)
logger.info(f"[rank {rank}] clip_grad_norm_ done in {_time.time() - t0:.1f}s, grad_norm={grad_norm}")

# Skip update if gradient norm is not finite
if grad_norm is not None and not torch.isfinite(grad_norm):
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
logger.warning(f"rank {rank} grad_norm is not finite: {grad_norm}")
else:
logger.warning(f"grad_norm is not finite: {grad_norm}")
optimizer.zero_grad()
return grad_norm
# NOTE: With FSDP, optimizer.step() involves NCCL collectives. ALL ranks
# must call it even if grad_norm is non-finite, otherwise NCCL deadlocks.
# We zero_grad before stepping so the non-finite update is harmless.
non_finite = grad_norm is not None and not torch.isfinite(grad_norm)
if non_finite:
logger.warning(f"rank {rank} grad_norm is not finite: {grad_norm}, zeroing grads before step")
optimizer.zero_grad(set_to_none=True)
Comment on lines +190 to +196
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this modification needed? Do we now skip optimizer.step on all ranks when the grad norm is NaN?


t0 = _time.time()
optimizer.step()
if scheduler is not None:
logger.info(f"[rank {rank}] optimizer.step() done in {_time.time() - t0:.1f}s")
Comment on lines +188 to +200
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logger.info timing logs are emitted on every rank for every training step (clip_grad_norm_ and optimizer.step). In long runs this can generate extremely large logs and can materially slow training due to logging overhead.

Consider making these logs debug-level, gating them behind a config flag, and/or restricting to rank 0 (or periodic sampling) to avoid impacting throughput.

Copilot uses AI. Check for mistakes.
Comment on lines +188 to +200
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Per-step INFO logging on every rank causes excessive log output in production

The optimizer_step method adds logger.info() calls at lines 188 and 200 that fire on every optimizer step on every rank. All other logger.info calls in this file are for one-time initialization/config events (e.g., architecture name fixes at fsdp_strategy.py:367). In a typical training run with hundreds of ranks and thousands of steps, these two lines will generate millions of INFO log entries, degrading I/O performance and making logs unusable. These should be logger.debug() at most.

Suggested change
logger.info(f"[rank {rank}] clip_grad_norm_ done in {_time.time() - t0:.1f}s, grad_norm={grad_norm}")
# Skip update if gradient norm is not finite
if grad_norm is not None and not torch.isfinite(grad_norm):
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
logger.warning(f"rank {rank} grad_norm is not finite: {grad_norm}")
else:
logger.warning(f"grad_norm is not finite: {grad_norm}")
optimizer.zero_grad()
return grad_norm
# NOTE: With FSDP, optimizer.step() involves NCCL collectives. ALL ranks
# must call it even if grad_norm is non-finite, otherwise NCCL deadlocks.
# We zero_grad before stepping so the non-finite update is harmless.
non_finite = grad_norm is not None and not torch.isfinite(grad_norm)
if non_finite:
logger.warning(f"rank {rank} grad_norm is not finite: {grad_norm}, zeroing grads before step")
optimizer.zero_grad(set_to_none=True)
t0 = _time.time()
optimizer.step()
if scheduler is not None:
logger.info(f"[rank {rank}] optimizer.step() done in {_time.time() - t0:.1f}s")
logger.debug(f"[rank {rank}] clip_grad_norm_ done in {_time.time() - t0:.1f}s, grad_norm={grad_norm}")
# NOTE: With FSDP, optimizer.step() involves NCCL collectives. ALL ranks
# must call it even if grad_norm is non-finite, otherwise NCCL deadlocks.
# We zero_grad before stepping so the non-finite update is harmless.
non_finite = grad_norm is not None and not torch.isfinite(grad_norm)
if non_finite:
logger.warning(f"rank {rank} grad_norm is not finite: {grad_norm}, zeroing grads before step")
optimizer.zero_grad(set_to_none=True)
t0 = _time.time()
optimizer.step()
logger.debug(f"[rank {rank}] optimizer.step() done in {_time.time() - t0:.1f}s")
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

# Only advance LR schedule when gradients were finite (non-finite steps are no-ops)
if scheduler is not None and not non_finite:
scheduler.step()
optimizer.zero_grad()
return grad_norm
Comment on lines +190 to 205
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new non-finite grad-norm handling changes correctness-sensitive behavior (all ranks must still call optimizer.step(), scheduler step is skipped, grads are cleared before stepping). There doesn’t appear to be a test covering this branch in the existing FSDP strategy test suite.

Add a unit/integration test that forces grad_norm to be non-finite (e.g., by injecting NaNs into a parameter grad) and asserts: (1) optimizer.step() is still invoked, (2) LR scheduler is not advanced, and (3) parameters do not change on the non-finite step.

Copilot uses AI. Check for mistakes.
Expand Down Expand Up @@ -273,9 +279,12 @@ def _fsdp_init_model(self, model, is_train=True, is_wrapped=False):
"reshard_after_forward": self.fsdp_config.reshard_after_forward,
}
module = model.model if is_wrapped else model
if getattr(module.config, "tie_word_embeddings", False):
module.tie_weights()
full_state = module.state_dict()
apply_fsdp2(module, fsdp_kwargs, self.fsdp_config)
fsdp2_load_full_state_dict(module, full_state, cpu_offload)
del full_state # free CPU memory (rank 0 held full model copy)
fsdp_module = module
else:
raise NotImplementedError(f"{self.fsdp_strategy} not implemented")
Expand Down
72 changes: 48 additions & 24 deletions skyrl/backends/skyrl_train/distributed/fsdp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,33 +293,57 @@ def _cast_and_contiguous(tensor, to_contiguous, dtype):
tensor = tensor.contiguous()
return tensor

if dist.get_rank() == 0:
for (param_name, full_param), sharded_param in zip(full_sd.items(), meta_sharded_sd.values()):
full_param = full_param.detach().cuda()
mesh = sharded_param.device_mesh
dist.broadcast(full_param, src=0)
sharded_tensor = distribute_tensor(full_param, mesh, sharded_param.placements)
to_contiguous, casting_dtype = _infer_parameter_dtype(
model,
param_name,
full_param,
)
sharded_tensor = _cast_and_contiguous(sharded_tensor, to_contiguous, casting_dtype)
sharded_sd[param_name] = sharded_tensor
# We need this else to have a matching `broadcast` for all of the ranks, else we deadlock
else:
for param_name, sharded_param in meta_sharded_sd.items():
full_tensor = torch.empty(sharded_param.size(), device="cuda", dtype=sharded_param.dtype)
# Batched broadcast: coalesce many small tensors into fewer NCCL calls.
# For MoE models with 18,000+ params, this reduces init from minutes to seconds.
BATCH_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per coalesced broadcast

param_names = list(meta_sharded_sd.keys())
batch_tensors = []
batch_names = []
batch_bytes = 0

def _flush_batch():
"""Broadcast current batch and distribute to shards."""
if not batch_tensors:
return
# Use coalesced broadcast when available (private API), fall back to per-tensor
if len(batch_tensors) > 1 and hasattr(dist, "_broadcast_coalesced"):
pg = dist.distributed_c10d._get_default_group()
dist._broadcast_coalesced(pg, batch_tensors, BATCH_SIZE_BYTES, 0)
else:
for t in batch_tensors:
dist.broadcast(t, src=0)

for name, full_tensor in zip(batch_names, batch_tensors):
sharded_param = meta_sharded_sd[name]
mesh = sharded_param.device_mesh
dist.broadcast(full_tensor, src=0)
sharded_tensor = distribute_tensor(full_tensor, mesh, sharded_param.placements)
to_contiguous, casting_dtype = _infer_parameter_dtype(
model,
param_name,
full_tensor,
)
to_contiguous, casting_dtype = _infer_parameter_dtype(model, name, full_tensor)
sharded_tensor = _cast_and_contiguous(sharded_tensor, to_contiguous, casting_dtype)
sharded_sd[param_name] = sharded_tensor
sharded_sd[name] = sharded_tensor

for param_name in param_names:
sharded_param = meta_sharded_sd[param_name]
if dist.get_rank() == 0:
full_tensor = full_sd.pop(param_name).detach().cuda()
else:
full_tensor = torch.empty(sharded_param.size(), device="cuda", dtype=sharded_param.dtype)

tensor_bytes = full_tensor.nelement() * full_tensor.element_size()

# If adding this tensor exceeds batch size, flush current batch first
if batch_bytes + tensor_bytes > BATCH_SIZE_BYTES and batch_tensors:
_flush_batch()
batch_tensors.clear()
batch_names.clear()
batch_bytes = 0

batch_tensors.append(full_tensor)
batch_names.append(param_name)
batch_bytes += tensor_bytes

# Flush remaining tensors
_flush_batch()

# we set `assign=True` because our params can be on meta device
model.load_state_dict(sharded_sd, assign=True)
Expand Down
Loading
Loading