@@ -314,11 +314,12 @@ def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_di
     for x, y in zip(modifier_token_id, args.modifier_token):
         learned_embeds_dict = {}
         learned_embeds_dict[y] = learned_embeds[x]
-        filename = f"{output_dir}/{y}.bin"
 
         if safe_serialization:
+            filename = f"{output_dir}/{y}.safetensors"
             safetensors.torch.save_file(learned_embeds_dict, filename, metadata={"format": "pt"})
         else:
+            filename = f"{output_dir}/{y}.bin"
             torch.save(learned_embeds_dict, filename)
 
 
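As a quick check on the new branch-specific filenames, here is a minimal sketch of how an embedding saved by `save_new_embed` could be loaded back. The output directory and token name below are assumptions for illustration, not values taken from the script:

```python
import os

import torch
from safetensors.torch import load_file

output_dir = "path/to/output"  # hypothetical output directory
token = "<new1>"               # hypothetical modifier token

# Prefer the .safetensors file written by the safe_serialization branch,
# fall back to the .bin file written by torch.save otherwise.
safetensors_path = os.path.join(output_dir, f"{token}.safetensors")
if os.path.exists(safetensors_path):
    learned_embeds_dict = load_file(safetensors_path)
else:
    learned_embeds_dict = torch.load(os.path.join(output_dir, f"{token}.bin"), map_location="cpu")

embedding = learned_embeds_dict[token]  # the learned embedding tensor for the token
```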
@@ -1040,17 +1041,22 @@ def main(args):
     )
 
     # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
     if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        overrode_max_train_steps = True
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
 
     lr_scheduler = get_scheduler(
         args.lr_scheduler,
         optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
-        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
     )
 
     # Prepare everything with our `accelerator`.
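A small worked example may help show why the scheduler step count is now derived from the sharded dataloader length. The numbers below are illustrative assumptions, not the script's defaults:

```python
import math

# Illustrative values: 2 processes, 100 unsharded batches per epoch,
# gradient accumulation of 4, 3 epochs.
num_processes = 2
len_train_dataloader = 100          # len(train_dataloader) before accelerator.prepare
gradient_accumulation_steps = 4
num_train_epochs = 3

# New computation: shard first, then account for gradient accumulation.
len_after_sharding = math.ceil(len_train_dataloader / num_processes)              # 50
updates_per_epoch = math.ceil(len_after_sharding / gradient_accumulation_steps)   # 13
num_training_steps_for_scheduler = num_train_epochs * updates_per_epoch * num_processes  # 78

# Old computation ignored sharding, so the scheduler was told to run for far
# more steps than the training loop would actually perform:
old_updates_per_epoch = math.ceil(len_train_dataloader / gradient_accumulation_steps)    # 25
old_training_steps = num_train_epochs * old_updates_per_epoch * num_processes            # 150
```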
@@ -1065,8 +1071,14 @@ def main(args):
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
+    if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
     # Afterwards we recalculate our number of training epochs
     args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
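For completeness, a sketch (again with made-up numbers continuing the example above) of how the post-`prepare` recalculation can disagree with the step count the scheduler was built with, which is exactly when the new warning fires:

```python
import math

# Assumed values carried over from the previous sketch.
num_processes = 2
gradient_accumulation_steps = 4
num_train_epochs = 3
num_training_steps_for_scheduler = 78   # computed before accelerator.prepare

# Hypothetical per-process dataloader length after accelerator.prepare,
# smaller than the pre-prepare estimate of 50.
len_train_dataloader_prepared = 45
updates_per_epoch = math.ceil(len_train_dataloader_prepared / gradient_accumulation_steps)  # 12
max_train_steps = num_train_epochs * updates_per_epoch                                       # 36

if num_training_steps_for_scheduler != max_train_steps * num_processes:  # 78 != 72
    print("scheduler step count no longer matches; the warning above would fire")
```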