@@ -384,23 +384,18 @@ void LLamaModel::backward(Tensor inputs, Tensor targets, NCCLCommunicator& comm,
     rmsnorm_backward(rs->DActs[L-1].DResFFN.Value, d_lnf_w, rs->RMSNormScratch, rs->DActs[L-1].DResFFN.Value, rs->DLNF,
                      rs->get_res_ffn(L-1, main_stream), Parameters->get_lnf(main_stream), rs->LNF_Rstd,
                      quant_abs_max_ptr(rs->DActs[L-1].DResFFN), B, T, C, rs->DeviceProp, main_stream);
-    CUDA_CHECK(cudaDeviceSynchronize());
     rs->release_res_ffn(L-1, main_stream);
 
     Parameters->release_lnf(main_stream);
-    CUDA_CHECK(cudaDeviceSynchronize());
     Grads->notify_lnf_w(main_stream, comm);
-    CUDA_CHECK(cudaDeviceSynchronize());
     rs->fetch_res_ffn(L-2, comm.stream());
-    CUDA_CHECK(cudaDeviceSynchronize());
     Parameters->gather_block(L - 1, comm, *rs);
     // now backward all the layers
     for (int l = L-1; l >= 0; l--) {
         NvtxRange layer_range("Layer", l);
         auto& dw = Grads->get_block_full(l, main_stream, comm, accumulate);
 
         // prefetch previous layer
-        CUDA_CHECK(cudaDeviceSynchronize());
         if (l > 1) {
             rs->fetch_res_ffn(l-2, comm.stream());
         }
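The deleted `CUDA_CHECK(cudaDeviceSynchronize())` calls block the host until every stream on the device has drained, which defeats the overlap between `main_stream` and the communicator stream that the prefetch/gather calls are meant to exploit. Dropping them is only correct if cross-stream ordering is enforced on the device itself; presumably the helpers here (`fetch_res_ffn`, `gather_block`, etc.) already do that internally, typically with events. Below is a minimal, self-contained sketch of that pattern in isolation; it is not taken from this repo, and the `producer`/`consumer` kernels are hypothetical stand-ins:

```cpp
// Sketch: order work across two streams with an event instead of blocking
// the whole device from the host with cudaDeviceSynchronize().
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

__global__ void producer(float* buf) { buf[threadIdx.x] = (float)threadIdx.x; }
__global__ void consumer(const float* buf, float* out) { out[threadIdx.x] = 2.0f * buf[threadIdx.x]; }

int main() {
    cudaStream_t main_stream, comm_stream;
    cudaEvent_t ready;
    float *buf, *out;
    CUDA_CHECK(cudaStreamCreate(&main_stream));
    CUDA_CHECK(cudaStreamCreate(&comm_stream));
    // Timing disabled: the event is used purely for ordering.
    CUDA_CHECK(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));
    CUDA_CHECK(cudaMalloc(&buf, 32 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&out, 32 * sizeof(float)));

    producer<<<1, 32, 0, main_stream>>>(buf);
    // Record completion of the producer on main_stream ...
    CUDA_CHECK(cudaEventRecord(ready, main_stream));
    // ... and make comm_stream wait for it on-device. No host-side
    // device-wide synchronization is needed for correctness.
    CUDA_CHECK(cudaStreamWaitEvent(comm_stream, ready, 0));
    consumer<<<1, 32, 0, comm_stream>>>(buf, out);

    // Synchronize once, at the end, on the one stream we need results from.
    CUDA_CHECK(cudaStreamSynchronize(comm_stream));

    CUDA_CHECK(cudaFree(buf));
    CUDA_CHECK(cudaFree(out));
    CUDA_CHECK(cudaEventDestroy(ready));
    CUDA_CHECK(cudaStreamDestroy(main_stream));
    CUDA_CHECK(cudaStreamDestroy(comm_stream));
    return 0;
}
```

With event-based ordering in place, the only host-side synchronization left is wherever the host actually consumes device results, so the per-layer loop above can keep the NCCL prefetches on `comm.stream()` fully overlapped with compute on `main_stream`.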