Skip to content

Commit

Permalink
Merge pull request #364 from itayhubara/fix_rcps_llama2
Browse files (browse the repository at this point in the history)
new rcp for llama after gradient clipping fix
  • Loading branch information
nv-rborkar authored Apr 5, 2024
2 parents e758874 + 6ad128b commit be06569
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 12 deletions.
2 changes: 1 addition & 1 deletion mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
'rnnt': 10,
'unet3d': 40,
'gnn' : 10,
- 'llama2_70b_lora': 12,
+ 'llama2_70b_lora': 10,
},

'hpc' : {
Expand Down
6 changes: 2 additions & 4 deletions mlperf_logging/rcp_checker/rcp_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
'rnnt': 10,
'stable_diffusion': 10,
'gnn': 10,
- 'llama2_70b_lora': 12,
+ 'llama2_70b_lora': 10,
},
"hpc": {
'cosmoflow': 10,
Expand Down Expand Up @@ -153,7 +153,7 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples):

if (bert_train_samples and benchmark != "bert"):
logging.info(' bert_train_samples set for submission that is not bert')
- if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d') or (not_converged > 2 and benchmark == 'llama2_70b_lora'):
+ if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'):
subm_epochs = None
return bs, subm_epochs, benchmark

Expand Down Expand Up @@ -267,7 +267,6 @@ def compute_rcp_stats(self):
# Use olympic mean
epoch_list.sort()
samples_rejected = 4 if record_contents['Benchmark'] == 'unet3d' else 1
- samples_rejected = 2 if record_contents['Benchmark'] == 'llama2_70b_lora' else samples_rejected
record_contents['RCP Mean'] = np.mean(epoch_list[samples_rejected:len(epoch_list)-samples_rejected])
record_contents['RCP Stdev'] = np.std(epoch_list[samples_rejected:len(epoch_list)-samples_rejected])
min_epochs = self._find_min_acceptable_mean(
Expand Down Expand Up @@ -434,7 +433,6 @@ def _eval_submission_record(self, rcp_record, subm_epochs, results_dir):
'''Compare reference and submission convergence.'''
subm_epochs.sort()
samples_rejected = 4 if rcp_record["Benchmark"] == 'unet3d' else 1
- samples_rejected = 2 if rcp_record["Benchmark"] == 'llama2_70b_lora' else samples_rejected
mean_subm_epochs = np.mean(subm_epochs[samples_rejected:len(subm_epochs)-samples_rejected])
norm_factor = self._find_norm_factor(rcp_record, mean_subm_epochs)
if mean_subm_epochs >= (rcp_record["RCP Mean"] / rcp_record["Max Speedup"]):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
"max_steps": 1024
},
"samples to converge": [
- 3840, 4224, 4608, 3840, 3840, 3840, 3840, 3456, 3840, 3456, 3456, 3840,
- 4224, 3456, 3840, 4224, 4992, 3840, 4992, 4224, 3840, 4608, 3840, 3840
+ 3072,2688,3456,3072,3072,3072,3456,3456,3072,2688,
+ 3456,3072,3072,3072,3840,3456,2688,3072,3456,3456
]
},

Expand All @@ -40,8 +40,8 @@
"max_steps": 1024
},
"samples to converge": [
- 5376, 5760, 5760, 6144, 5376, 7296, 5760, 5760, 5760, 6144, 5760, 7680,
- 5376, 5760, 6912, 7680, 6144, 6144, 5760, 5376, 6144, 6144, 6528, 6912
+ 3840,3840,4224,3840,3840,3840,4608,3840,4608,3840,
+ 4992,3840,3840,3840,4992,3840,3840,4224,3840,3456
]
},
"llama2_70b_lora_ref_32":
Expand All @@ -62,9 +62,8 @@
"max_steps": 1024
},
"samples to converge": [
- 9600, 10368, 7680, 9600, 9600, 8832, 8064, 10752, 8448, 8064, 8832, 9984,
- 8064, 13824, 8448, 8832, 8064, 9216, 8832, 8064, 9984, 9984, 9216, 8848
-
+ 5760,6528,6144,6528,5376,6528,5760,6144,6144,6528,
+ 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760
]
}
}

0 comments on commit be06569

Please sign in to comment.