Merge pull request #364 from itayhubara/fix_rcps_llama2

nv-rborkar · web-flow · commit be065690076d · 2024-04-05T09:45:52.000-07:00
New RCPs for llama2_70b_lora after the gradient clipping fix
diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
@@ -16,7 +16,7 @@
         'rnnt': 10,
         'unet3d': 40,
         'gnn' : 10,  
-        'llama2_70b_lora': 12,
+        'llama2_70b_lora': 10,
     },
     
     'hpc' : {
diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -29,7 +29,7 @@
         'rnnt': 10,
         'stable_diffusion': 10,
         'gnn': 10,  
-        'llama2_70b_lora': 12,
+        'llama2_70b_lora': 10,
     },
     "hpc": {
         'cosmoflow': 10,
@@ -153,7 +153,7 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples):
 
     if (bert_train_samples and benchmark != "bert"):
         logging.info(' bert_train_samples set for submission that is not bert')
-    if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d') or (not_converged > 2 and benchmark == 'llama2_70b_lora'):
+    if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'):
         subm_epochs = None
     return bs, subm_epochs, benchmark
 
@@ -267,7 +267,6 @@ def compute_rcp_stats(self):
             # Use olympic mean
             epoch_list.sort()
             samples_rejected = 4 if record_contents['Benchmark'] == 'unet3d' else 1
-            samples_rejected = 2 if record_contents['Benchmark'] == 'llama2_70b_lora' else samples_rejected
             record_contents['RCP Mean'] = np.mean(epoch_list[samples_rejected:len(epoch_list)-samples_rejected])
             record_contents['RCP Stdev'] = np.std(epoch_list[samples_rejected:len(epoch_list)-samples_rejected])
             min_epochs = self._find_min_acceptable_mean(
@@ -434,7 +433,6 @@ def _eval_submission_record(self, rcp_record, subm_epochs, results_dir):
         '''Compare reference and submission convergence.'''
         subm_epochs.sort()
         samples_rejected = 4 if rcp_record["Benchmark"] == 'unet3d' else 1
-        samples_rejected = 2 if rcp_record["Benchmark"] == 'llama2_70b_lora' else samples_rejected
         mean_subm_epochs = np.mean(subm_epochs[samples_rejected:len(subm_epochs)-samples_rejected])
         norm_factor = self._find_norm_factor(rcp_record, mean_subm_epochs)
         if mean_subm_epochs >= (rcp_record["RCP Mean"] / rcp_record["Max Speedup"]):
diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json
@@ -17,8 +17,8 @@
       "max_steps": 1024 
     },
     "samples to converge": [
-      3840, 4224, 4608, 3840, 3840, 3840, 3840, 3456, 3840, 3456, 3456, 3840,
-      4224, 3456, 3840, 4224, 4992, 3840, 4992, 4224, 3840, 4608, 3840, 3840
+      3072,2688,3456,3072,3072,3072,3456,3456,3072,2688,
+      3456,3072,3072,3072,3840,3456,2688,3072,3456,3456
     ]
   },
 
@@ -40,8 +40,8 @@
       "max_steps": 1024 
     },
     "samples to converge": [
-      5376, 5760, 5760, 6144, 5376, 7296, 5760, 5760, 5760, 6144, 5760, 7680,
-      5376, 5760, 6912, 7680, 6144, 6144, 5760, 5376, 6144, 6144, 6528, 6912
+      3840,3840,4224,3840,3840,3840,4608,3840,4608,3840,
+      4992,3840,3840,3840,4992,3840,3840,4224,3840,3456
     ]
   },
   "llama2_70b_lora_ref_32":
@@ -62,9 +62,8 @@
       "max_steps": 1024 
     },
     "samples to converge": [
-      9600, 10368, 7680, 9600, 9600, 8832, 8064, 10752, 8448, 8064, 8832, 9984,
-      8064, 13824, 8448, 8832, 8064, 9216, 8832, 8064, 9984, 9984, 9216, 8848
-
+      5760,6528,6144,6528,5376,6528,5760,6144,6144,6528,
+      6144,6144,6144,5760,5760,5760,5760,5760,6144,5760
     ]
   }  
 }