@@ -653,11 +653,10 @@ def _replace(child, name, conv_linear_layer):
                new_bias = torch.empty((weight_shape[0]),
                                       device=child.weight.device,
                                       dtype=child.weight.dtype)
-
                if z_inference:
                    with deepspeed.zero.GatheredParameters(child.bias, modifier_rank=0):
                        new_bias.data.copy_(child.bias.data)
-                else:
+                elif child.bias is not None:
                    new_bias.data.copy_(child.bias.data)
            return LinearAllreduce(data, child.bias if child.bias is None else \
                        torch.nn.parameter.Parameter(new_bias.to(torch.cuda.current_device())), mp_group)
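Note: the new `elif` branch only copies a bias when the wrapped layer actually has one (ZeRO-3 inference still gathers the bias first via `GatheredParameters`). Below is a minimal standalone sketch of the same guard, without DeepSpeed or the gather; the helper name `copy_bias_if_present` is made up for illustration.

```python
import torch
import torch.nn as nn
from typing import Optional


def copy_bias_if_present(child: nn.Linear) -> Optional[torch.Tensor]:
    # Mirror the guard above: layers built with bias=False have no bias
    # tensor, so there is nothing to materialize or copy.
    if child.bias is None:
        return None
    # Allocate the replacement bias on the same device/dtype as the weight,
    # then copy the existing values into it.
    new_bias = torch.empty(child.bias.shape,
                           device=child.weight.device,
                           dtype=child.weight.dtype)
    new_bias.data.copy_(child.bias.data)
    return new_bias


print(copy_bias_if_present(nn.Linear(4, 4, bias=False)))  # None
print(copy_bias_if_present(nn.Linear(4, 4)).shape)        # torch.Size([4])
```

Testing a plain tensor with `elif child.bias:` would raise for a multi-element bias, so the `is not None` check is the safe way to express "a bias exists".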
@@ -704,7 +703,9 @@ def _slice_embedding(child, name, conv_linear_layer):
                                 child.weight.shape[1] // mp_size),
                                device=child.weight.device,
                                dtype=child.weight.dtype)
-        data = mp_replace.copy(new_weight, child.weight.ds_tensor.data)
+        data = mp_replace.copy(new_weight,
+                               child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \
+                               child.weight.data)
        new_embedding = nn.Embedding(child.weight.shape[0],
                                     child.weight.shape[1] // mp_size)
        new_embedding.weight.data.copy_(data)
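Note: under ZeRO-3 a partitioned parameter carries a `ds_tensor` attribute holding its local shard, while an ordinary parameter does not, so the copy now falls back to the full `weight.data` in the non-partitioned case. A minimal sketch of that fallback, with no DeepSpeed partitioning involved and the helper name `source_tensor` invented for illustration:

```python
import torch
import torch.nn as nn


def source_tensor(param: nn.Parameter) -> torch.Tensor:
    # Prefer the local shard when the parameter has been partitioned by
    # ZeRO-3 (it then exposes a `ds_tensor` attribute); otherwise use the
    # full parameter data.
    return param.ds_tensor.data if hasattr(param, 'ds_tensor') else param.data


# An ordinary Embedding weight has no ds_tensor, so the full tensor is used.
emb = nn.Embedding(10, 8)
print(source_tensor(emb.weight).shape)  # torch.Size([10, 8])
```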