Skip to content

Commit be06569

Browse files
authored
Merge pull request #364 from itayhubara/fix_rcps_llama2
new rcp for llama after gradient clipping fix
2 parents: e758874 + 6ad128b; commit: be06569

File tree

3 files changed: +9 / -12 lines changed

mlperf_logging/benchmark_meta.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
'rnnt': 10,
1717
'unet3d': 40,
1818
'gnn' : 10,
19-
'llama2_70b_lora': 12,
19+
'llama2_70b_lora': 10,
2020
},
2121

2222
'hpc' : {

mlperf_logging/rcp_checker/rcp_checker.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
'rnnt': 10,
3030
'stable_diffusion': 10,
3131
'gnn': 10,
32-
'llama2_70b_lora': 12,
32+
'llama2_70b_lora': 10,
3333
},
3434
"hpc": {
3535
'cosmoflow': 10,
@@ -153,7 +153,7 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples):
153153

154154
if (bert_train_samples and benchmark != "bert"):
155155
logging.info(' bert_train_samples set for submission that is not bert')
156-
if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d') or (not_converged > 2 and benchmark == 'llama2_70b_lora'):
156+
if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'):
157157
subm_epochs = None
158158
return bs, subm_epochs, benchmark
159159

@@ -267,7 +267,6 @@ def compute_rcp_stats(self):
267267
# Use olympic mean
268268
epoch_list.sort()
269269
samples_rejected = 4 if record_contents['Benchmark'] == 'unet3d' else 1
270-
samples_rejected = 2 if record_contents['Benchmark'] == 'llama2_70b_lora' else samples_rejected
271270
record_contents['RCP Mean'] = np.mean(epoch_list[samples_rejected:len(epoch_list)-samples_rejected])
272271
record_contents['RCP Stdev'] = np.std(epoch_list[samples_rejected:len(epoch_list)-samples_rejected])
273272
min_epochs = self._find_min_acceptable_mean(
@@ -434,7 +433,6 @@ def _eval_submission_record(self, rcp_record, subm_epochs, results_dir):
434433
'''Compare reference and submission convergence.'''
435434
subm_epochs.sort()
436435
samples_rejected = 4 if rcp_record["Benchmark"] == 'unet3d' else 1
437-
samples_rejected = 2 if rcp_record["Benchmark"] == 'llama2_70b_lora' else samples_rejected
438436
mean_subm_epochs = np.mean(subm_epochs[samples_rejected:len(subm_epochs)-samples_rejected])
439437
norm_factor = self._find_norm_factor(rcp_record, mean_subm_epochs)
440438
if mean_subm_epochs >= (rcp_record["RCP Mean"] / rcp_record["Max Speedup"]):

mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
"max_steps": 1024
1818
},
1919
"samples to converge": [
20-
3840, 4224, 4608, 3840, 3840, 3840, 3840, 3456, 3840, 3456, 3456, 3840,
21-
4224, 3456, 3840, 4224, 4992, 3840, 4992, 4224, 3840, 4608, 3840, 3840
20+
3072,2688,3456,3072,3072,3072,3456,3456,3072,2688,
21+
3456,3072,3072,3072,3840,3456,2688,3072,3456,3456
2222
]
2323
},
2424

@@ -40,8 +40,8 @@
4040
"max_steps": 1024
4141
},
4242
"samples to converge": [
43-
5376, 5760, 5760, 6144, 5376, 7296, 5760, 5760, 5760, 6144, 5760, 7680,
44-
5376, 5760, 6912, 7680, 6144, 6144, 5760, 5376, 6144, 6144, 6528, 6912
43+
3840,3840,4224,3840,3840,3840,4608,3840,4608,3840,
44+
4992,3840,3840,3840,4992,3840,3840,4224,3840,3456
4545
]
4646
},
4747
"llama2_70b_lora_ref_32":
@@ -62,9 +62,8 @@
6262
"max_steps": 1024
6363
},
6464
"samples to converge": [
65-
9600, 10368, 7680, 9600, 9600, 8832, 8064, 10752, 8448, 8064, 8832, 9984,
66-
8064, 13824, 8448, 8832, 8064, 9216, 8832, 8064, 9984, 9984, 9216, 8848
67-
65+
5760,6528,6144,6528,5376,6528,5760,6144,6144,6528,
66+
6144,6144,6144,5760,5760,5760,5760,5760,6144,5760
6867
]
6968
}
7069
}

0 commit comments

Comments
 (0)