diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index aa57eff..fa8d3e4 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -16,7 +16,7 @@ 'rnnt': 10, 'unet3d': 40, 'gnn' : 10, - 'llama2_70b_lora': 12, + 'llama2_70b_lora': 10, }, 'hpc' : { diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 45f07b4..b63ebe2 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -29,7 +29,7 @@ 'rnnt': 10, 'stable_diffusion': 10, 'gnn': 10, - 'llama2_70b_lora': 12, + 'llama2_70b_lora': 10, }, "hpc": { 'cosmoflow': 10, @@ -153,7 +153,7 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples): if (bert_train_samples and benchmark != "bert"): logging.info(' bert_train_samples set for submission that is not bert') - if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d') or (not_converged > 2 and benchmark == 'llama2_70b_lora'): + if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'): subm_epochs = None return bs, subm_epochs, benchmark @@ -267,7 +267,6 @@ def compute_rcp_stats(self): # Use olympic mean epoch_list.sort() samples_rejected = 4 if record_contents['Benchmark'] == 'unet3d' else 1 - samples_rejected = 2 if record_contents['Benchmark'] == 'llama2_70b_lora' else samples_rejected record_contents['RCP Mean'] = np.mean(epoch_list[samples_rejected:len(epoch_list)-samples_rejected]) record_contents['RCP Stdev'] = np.std(epoch_list[samples_rejected:len(epoch_list)-samples_rejected]) min_epochs = self._find_min_acceptable_mean( @@ -434,7 +433,6 @@ def _eval_submission_record(self, rcp_record, subm_epochs, results_dir): '''Compare reference and submission convergence.''' subm_epochs.sort() samples_rejected = 4 if rcp_record["Benchmark"] == 'unet3d' else 1 - samples_rejected = 2 if rcp_record["Benchmark"] == 'llama2_70b_lora' else samples_rejected mean_subm_epochs = np.mean(subm_epochs[samples_rejected:len(subm_epochs)-samples_rejected]) norm_factor = self._find_norm_factor(rcp_record, mean_subm_epochs) if mean_subm_epochs >= (rcp_record["RCP Mean"] / rcp_record["Max Speedup"]): diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json index 2184a38..abfc4e4 100644 --- a/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json @@ -17,8 +17,8 @@ "max_steps": 1024 }, "samples to converge": [ - 3840, 4224, 4608, 3840, 3840, 3840, 3840, 3456, 3840, 3456, 3456, 3840, - 4224, 3456, 3840, 4224, 4992, 3840, 4992, 4224, 3840, 4608, 3840, 3840 + 3072,2688,3456,3072,3072,3072,3456,3456,3072,2688, + 3456,3072,3072,3072,3840,3456,2688,3072,3456,3456 ] }, @@ -40,8 +40,8 @@ "max_steps": 1024 }, "samples to converge": [ - 5376, 5760, 5760, 6144, 5376, 7296, 5760, 5760, 5760, 6144, 5760, 7680, - 5376, 5760, 6912, 7680, 6144, 6144, 5760, 5376, 6144, 6144, 6528, 6912 + 3840,3840,4224,3840,3840,3840,4608,3840,4608,3840, + 4992,3840,3840,3840,4992,3840,3840,4224,3840,3456 ] }, "llama2_70b_lora_ref_32": @@ -62,9 +62,8 @@ "max_steps": 1024 }, "samples to converge": [ - 9600, 10368, 7680, 9600, 9600, 8832, 8064, 10752, 8448, 8064, 8832, 9984, - 8064, 13824, 8448, 8832, 8064, 9216, 8832, 8064, 9984, 9984, 9216, 8848 - + 5760,6528,6144,6528,5376,6528,5760,6144,6144,6528, + 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760 ] } }