From 0a0a659d895eb34818bd929f76acdd616f8de3ee Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Fri, 5 Apr 2024 14:05:07 -0500 Subject: [PATCH] Add 4.0 config + RPCs --- .../training_4.0.0/closed_bert.yaml | 48 +++ .../training_4.0.0/closed_dlrm_dcnv2.yaml | 59 ++++ .../training_4.0.0/closed_gpt3.yaml | 79 +++++ .../training_4.0.0/closed_resnet.yaml | 17 + .../training_4.0.0/closed_resnet_lars.yaml | 37 +++ .../training_4.0.0/closed_resnet_sgd.yaml | 36 +++ .../training_4.0.0/closed_ssd.yaml | 35 ++ .../training_4.0.0/closed_unet3d.yaml | 73 +++++ .../training_4.0.0/open_bert.yaml | 7 + .../training_4.0.0/open_dlrm_dcnv2.yaml | 7 + .../training_4.0.0/open_gpt3.yaml | 79 +++++ .../training_4.0.0/open_llama2_70b_lora.yaml | 7 + .../training_4.0.0/open_resnet.yaml | 7 + .../training_4.0.0/open_ssd.yaml | 7 + .../training_4.0.0/open_unet3d.yaml | 7 + .../rcp_checker/training_4.0.0/rcps_bert.json | 303 ++++++++++++++++++ .../training_4.0.0/rcps_dlrm_dcnv2.json | 133 ++++++++ .../rcp_checker/training_4.0.0/rcps_gpt3.json | 78 +++++ .../training_4.0.0/rcps_resnet.json | 221 +++++++++++++ .../rcp_checker/training_4.0.0/rcps_ssd.json | 145 +++++++++ 20 files changed, 1385 insertions(+) create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_bert.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_dlrm_dcnv2.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_gpt3.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_resnet.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_lars.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_sgd.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_ssd.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_unet3d.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_bert.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_dlrm_dcnv2.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_gpt3.yaml create mode 100755 mlperf_logging/compliance_checker/training_4.0.0/open_llama2_70b_lora.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_resnet.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_ssd.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_unet3d.yaml create mode 100644 mlperf_logging/rcp_checker/training_4.0.0/rcps_bert.json create mode 100644 mlperf_logging/rcp_checker/training_4.0.0/rcps_dlrm_dcnv2.json create mode 100644 mlperf_logging/rcp_checker/training_4.0.0/rcps_gpt3.json create mode 100644 mlperf_logging/rcp_checker/training_4.0.0/rcps_resnet.json create mode 100644 mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_bert.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_bert.yaml new file mode 100644 index 00000000..408f669b --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_bert.yaml @@ -0,0 +1,48 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: 
num_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: start_warmup_step + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_weight_decay_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_dlrm_dcnv2.yaml new file mode 100644 index 00000000..45344bd2 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_dlrm_dcnv2.yaml @@ -0,0 +1,59 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adagrad' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adagrad_learning_rate_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_initial_accumulator_value + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_start_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0" + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 89137319 " diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_gpt3.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_gpt3.yaml new file mode 100644 index 00000000..8007184a --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_gpt3.yaml @@ -0,0 +1,79 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2048 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adam_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adam_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adam_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" + +- KEY: + NAME: init_checkpoint_step + REQ: 
EXACTLY_ONE + CHECK: " v['value'] > 0 " + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet.yaml new file mode 100644 index 00000000..67ec8107 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet.yaml @@ -0,0 +1,17 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['sgd', 'lars'] " + POST: " enqueue_config('training_3.1.0/closed_resnet_{}.yaml'.format(v['value'])) " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.7590 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_lars.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_lars.yaml new file mode 100644 index 00000000..0e8b96f7 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_lars.yaml @@ -0,0 +1,37 @@ + +- KEY: + NAME: lars_opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: lars_opt_end_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0001 " + +- KEY: + NAME: lars_opt_learning_rate_decay_poly_power + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2 " + +- KEY: + NAME: lars_opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: lars_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0 " + +- KEY: + NAME: lars_opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + +- KEY: + NAME: lars_opt_momentum + REQ: EXACTLY_ONE + +- KEY: + NAME: lars_opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " is_integer(math.log2(v['value'] / 0.0001)) " + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_sgd.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_sgd.yaml new file mode 100644 index 00000000..bc3b194f --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_sgd.yaml @@ -0,0 +1,36 @@ + +- KEY: + NAME: sgd_opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " is_integer(v['value'] / 0.001) " + +- KEY: + NAME: sgd_opt_end_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0001 " + +- KEY: + NAME: sgd_opt_learning_rate_decay_poly_power + REQ: EXACTLY_ONE + +- KEY: + NAME: sgd_opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: sgd_opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " is_integer(math.log2(v['value'] / 0.0001)) " + +- KEY: + NAME: sgd_opt_momentum + REQ: EXACTLY_ONE + +- KEY: + NAME: model_bn_span + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_ssd.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_ssd.yaml new file mode 100644 index 00000000..794ab7ab --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_ssd.yaml @@ -0,0 +1,35 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_factor + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + 
CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.340 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_unet3d.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_unet3d.yaml new file mode 100644 index 00000000..40184add --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_unet3d.yaml @@ -0,0 +1,73 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0" + +- KEY: + NAME: opt_learning_rate_decay_boundary_epochs + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_factor + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 168" + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 42" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: " v['value'] >= 0.908" + +- KEY: + NAME: seed + REQ: EXACTLY_ONE + +######################## CUSTOM ######################## + +- KEY: + NAME: opt_momentum + REQ: EXACTLY_ONE + +- KEY: + NAME: oversampling + REQ: EXACTLY_ONE + +- KEY: + NAME: training_input_shape + REQ: EXACTLY_ONE + +- KEY: + NAME: validation_input_shape + REQ: EXACTLY_ONE + +- KEY: + NAME: validation_overlap + REQ: EXACTLY_ONE + + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_bert.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_bert.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_bert.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_dlrm_dcnv2.yaml new file mode 100644 index 00000000..7f70c0c3 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_dlrm_dcnv2.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_gpt3.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_gpt3.yaml new file mode 100644 index 00000000..8007184a --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_gpt3.yaml @@ -0,0 +1,79 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2048 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adam_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adam_beta_2 + REQ: EXACTLY_ONE + CHECK: " 
v['value'] == 0.95 " + +- KEY: + NAME: opt_adam_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_llama2_70b_lora.yaml new file mode 100755 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_llama2_70b_lora.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_resnet.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_resnet.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_resnet.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_ssd.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_ssd.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_ssd.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_unet3d.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_unet3d.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_unet3d.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_bert.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_bert.json new file mode 100644 index 00000000..b28a1b15 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_bert.json @@ -0,0 +1,303 @@ +{ + + "bert_ref_256": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-16 / TF1, TF version ~2.4", + "BS": 256, + "Hyperparams": { + "opt_base_learning_rate": 0.00035, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 13700, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.9, + "opt_lamb_beta_2": 0.999, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 2834944, 2508800, 2709504, 2609152, 2383360, 2308096, 2910208, 2333184, 2283008, 2935296, + 2483712, 2558976, 2709504, 2232832, 2333184, 2533888, 2709504, 2257920, 2609152, 2809856] + }, + + "bert_ref_448": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 2.1 submission, with Habana's HP set", + "Platform": "TPU-v4-32 / TF1, TF version ~2.10", + "BS": 448, + "Hyperparams": { + "opt_base_learning_rate": 0.000425, + 
"opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 6700, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.9, + "opt_lamb_beta_2": 0.999, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 2132480, 2333184, 2408448, 2483712, 2684416, 2107392, 2157568, 2709504, 2533888, 2584064, + 1981952, 2182656, 2408448, 2433536, 2333184, 2533888, 2458624, 2558976, 2584064, 2358272, + 2358272, 2358272, 2759680] + }, + + "bert_ref_1536": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "At 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 1536, + "Hyperparams": { + "opt_base_learning_rate": 0.002, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 2254, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.996, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 2836240, 2801664, 2801664, 2727936, 2801664, 2875392, 2899968, 2727936, 2777088, 2875392, + 2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120] + }, + + "bert_ref_4096": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.1 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 0.0024, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 855, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.998, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 16 + }, + "Epochs to converge": [ + 2801664, 3022848, 2801664, 3022848, 3047424, 2727936, 2973696, 2703360, 2924544, 2629632, + 2678784, 2850816, 2777088, 2826240, 2801664, 2850816, 2924544, 2924544, 2727936, 2850816] + }, + + + "bert_ref_3072": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 3072, + "Hyperparams": { + "opt_base_learning_rate": 0.002, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 1141, + "num_warmup_steps": 100, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.998, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 96 + }, + "Epochs to converge": [ + 2703360, 2482176, 3072000, 2654208, 2580480, 2727936, 2605056, 2801664, 2777088, 2580480, + 2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816] + }, + + "bert_ref_4608": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 2.0 submission", + "Platform": "TPU-v4-16 / TF1, TF version ~2.8", + "BS": 4608, + "Hyperparams": { + "opt_base_learning_rate": 0.0035, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 700, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.62, + "opt_lamb_beta_2": 0.9, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 144 + }, + "Epochs to converge": [ + 2626560, 2833920, 2787840, 2949120, 2880000, 2810880, 2880000, 3041280, 2787840, 2833920, + 2741760, 2810880, 2649600, 2718720, 2488320, 2603520, 2833920, 2787840, 2810880, 3018240] + }, + + "bert_ref_6144": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "At 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 6144, + "Hyperparams": { + "opt_base_learning_rate": 0.0029293, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 700, 
+ "num_warmup_steps": 0, + "start_warmup_step": -700, + "opt_lamb_beta_1": 0.7206, + "opt_lamb_beta_2": 0.78921, + "opt_lamb_weight_decay_rate": 0.001, + "gradient_accumulation_steps": 24 + }, + "Epochs to converge": [ + 3366912, 3244032, 3219456, 3686400, 3317760, 3293184, 3416064, 3317760, 3391488, 2998272, + 3317760, 3072000, 3416064, 3293184, 3391488, 3514368, 3194880, 3465216, 3244032, 3268608] + }, + + "bert_ref_6912": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "At 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 6912, + "Hyperparams": { + "opt_base_learning_rate": 0.0029293, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 700, + "num_warmup_steps": 0, + "start_warmup_step": -700, + "opt_lamb_beta_1": 0.7206, + "opt_lamb_beta_2": 0.78921, + "opt_lamb_weight_decay_rate": 0.001, + "gradient_accumulation_steps": 27 + }, + "Epochs to converge": [ + 3621888, 3677184, 3400704, 3594240, 3483648, 3732480, 3677184, 3797776, 3621888, 3760128, + 3649536, 3483648, 3566592, 3649536, 3621888, 3483648, 3290112, 3704832, 3594240, 3511296] + }, + + "bert_ref_8192": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 8192, + "Hyperparams": { + "opt_base_learning_rate": 0.00288293, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 600, + "num_warmup_steps": 287, + "start_warmup_step": -76, + "opt_lamb_beta_1": 0.88, + "opt_lamb_beta_2": 0.88, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 16 + }, + "Epochs to converge": [ + 4251648, 4153344, 4055040, 4177920, 4177920, 4079616, 4276224, 4128768, 4177920, 4153344, + 4177920, 4079616, 4300800, 4153344, 4276224, 4423680, 4276224, 4104192, 4251648, 4153344] + }, + + "bert_ref_8704": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 8704, + "Hyperparams": { + "opt_base_learning_rate": 0.002971656225, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 600, + "num_warmup_steps": 287, + "start_warmup_step": -76, + "opt_lamb_beta_1": 0.88, + "opt_lamb_beta_2": 0.88, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 34 + }, + "Epochs to converge": [ + 4343040, 4143360, 4143360, 4442880, 4392960, 4243200, 4193280, 4542720, 4492800, 4243200, + 4243200, 4392960, 4243200, 4193280, 4093440, 4392960, 4093440, 4243200, 4093440, 4392960] + }, + + "bert_ref_12288": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 12288, + "Hyperparams": { + "opt_base_learning_rate": 0.0031, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 500, + "num_warmup_steps": 300, + "start_warmup_step": -100, + "opt_lamb_beta_1": 0.80, + "opt_lamb_beta_2": 0.925, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 32 + }, + "Epochs to converge": [ + 4542720, 4392960, 4642560, 4542720, 4542720, 4492800, 4343040, 4343040, 4442880, 4442880, + 4442880, 4442880, 4442880, 4692480, 4492800, 4442880, 4442880, 4442880, 4492800, 4343040] + }, + + "bert_ref_13056": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 13056, + "Hyperparams": { + "opt_base_learning_rate": 0.00319540686, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 500, + "num_warmup_steps": 300, + "start_warmup_step": -100, + "opt_lamb_beta_1": 0.80, + "opt_lamb_beta_2": 0.925, + 
"opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 34 + }, + "Epochs to converge": [ + 4442880, 4592640, 4642560, 4842240, 4742400, 4592640, 4642560, 4692480, 4942080, 4542720, + 4592640, 4093440, 4442880, 4792320, 4642560, 4592640, 4592640, 4892160, 4742400, 4592640] + }, + + "bert_ref_16384": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 2.0 submission", + "Platform": "TPU-v3-128", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.0033, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 600, + "num_warmup_steps": 290, + "start_warmup_step": -100, + "opt_lamb_beta_1": 0.75, + "opt_lamb_beta_2": 0.9, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 32 + }, + "Epochs to converge": [ + 5619712, 5770240, 5720064, 5419008, 5519360, 5569536, 5218304, 5469184, 5419008, 5218304, + 5669888, 5669888, 5519360, 5569536, 5368832, 5469184, 5569536, 5469184, 5368832, 5469184] + } +} diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_dlrm_dcnv2.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_dlrm_dcnv2.json new file mode 100644 index 00000000..b0fd3e6b --- /dev/null +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_dlrm_dcnv2.json @@ -0,0 +1,133 @@ +{ + + "dlrm_dcnv2_ref_32768": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "BS": 32768, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, + 0.75, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.7, 0.7, 0.7, + 0.7, 0.7, 0.75, 0.7, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7 + ] + }, + + "dlrm_dcnv2_ref_55296": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "BS": 55296, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.75, 0.75, 0.7, 0.8, 0.75, 0.75, 0.75, 0.75, 0.75, + 0.9, 0.7, 0.75, 0.8, 0.7, 0.8, 0.7, 0.7, 0.75, 0.7, + 0.7, 0.9, 0.75, 0.7, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, + 0.9, 0.75, 0.8, 0.75, 0.8, 0.75, 0.75, 0.75, 0.7, 0.75, + 0.75, 0.8, 0.75, 0.8, 0.8, 0.9, 0.75, 0.75, 0.7, 0.75 + ] + }, + + "dlrm_dcnv2_ref_65536": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "BS": 65536, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, 0.9, 0.95, 0.75, + 0.75, 0.75, 0.85, 0.85, 0.7, 0.75, 0.75, 0.9, 0.85, 0.8, + 0.7, 0.75, 0.75, 0.75, 0.8, 0.9, 0.75, 0.8, 0.85, 0.8 + ] + }, + + "dlrm_dcnv2_ref_102400": { + 
"Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "BS": 102400, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.85, 0.95, 0.95, 0.85, 0.9, 0.8, 0.85, 0.9, 0.9, 0.9, + 0.95, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.85, 0.9, 0.9, + 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.9, 0.9, + 0.9, 0.95, 0.85, 0.9, 0.9, 0.9, 0.85, 0.9, 0.95, 0.9, + 0.85, 0.95, 0.9, 0.9, 0.8, 0.9, 0.9, 0.9, 0.85, 0.9 + ] + }, + + "dlrm_dcnv2_ref_135168": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "BS": 135168, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.0034, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.95, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9, 0.95, 0.95, 0.9, + 0.95, 0.95, 0.95, 1.0, 0.85, 0.9, 0.9, 0.95, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.9, 0.95, 0.95, 1.0, 0.9, 0.95, 0.95, + 0.85, 0.95, 0.95, 0.95, 0.9, 0.95, 0.9, 0.9, 1.0, 0.9, + 0.95, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.9, 0.9, 0.9, + 0.9, 0.9, 0.9, 0.9, 0.95, 0.85, 0.95, 0.95, 0.9, 0.95, + 0.95, 0.95, 0.95, 1.0, 0.9, 0.95, 0.9, 1.0, 0.85, 0.9, + 0.9, 0.95, 0.95, 0.9, 0.95, 0.9, 0.95, 0.85, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.95, 0.9, 0.95, 0.9, 1.0 + ] + } + +} diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_gpt3.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gpt3.json new file mode 100644 index 00000000..24aa0120 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gpt3.json @@ -0,0 +1,78 @@ +{ + + "gpt3_ref_1536": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 1536, + "Hyperparams": { + "opt_base_learning_rate": 2e-5 + }, + "Epochs to converge": [ + 1157627904, 1157627904, 1157627904, 1258291200, 1207959552, 1258291200 + ] + }, + + "gpt3_ref_2048": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 2048, + "Hyperparams": { + "opt_base_learning_rate": 2e-5 + }, + "Epochs to converge": [ + 1157627904, 1207959552, 1157627904, 1207959552, 1207959552, 1157627904 + ] + }, + + "gpt3_ref_3072": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 3072, + "Hyperparams": { + "opt_base_learning_rate": 2e-5 + }, + "Epochs to converge": [ + 1258291200, 1207959552, 1207959552, 1207959552, 1207959552, 1207959552 + ] + }, + + "gpt3_ref_4096": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 3e-5 + }, + "Epochs to converge": [ + 1258291200, 1258291200, 1308622848, 1258291200, 1258291200, 
1258291200 + ] + }, + + "gpt3_ref_8192": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 8192, + "Hyperparams": { + "opt_base_learning_rate": 3e-5 + }, + "Epochs to converge": [ + 1610612736, 1660944384, 1660944384, 1610612736, 1610612736, 1610612736 + ] + } + +} diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_resnet.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_resnet.json new file mode 100644 index 00000000..d05e9195 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_resnet.json @@ -0,0 +1,221 @@ +{ + + "resnet_ref_1632": + { + "Benchmark": "resnet", + "Creator": "NVIDIA", + "When": "At 1.0 submission", + "Platform": "TBD", + "BS": 1632, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 7.4, + "opt_learning_rate_warmup_epochs": 2, + "opt_momentum": 0.9, + "opt_weight_decay": 5.0e-5, + "opt_learning_rate_decay_steps": 27476, + "model_bn_span": 204 + }, + "Epochs to converge": [ + 34, 34, 34, 35, 35, 37, 35, 35, 33, 35, + 35, 37, 37, 35, 35, 34, 35 + ] + }, + + "resnet_ref_3264": + { + "Benchmark": "resnet", + "Creator": "NVIDIA", + "When": "At 1.0 submission", + "Platform": "TBD", + "BS": 3264, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 10.5, + "opt_learning_rate_warmup_epochs": 2, + "opt_momentum": 0.9, + "opt_weight_decay": 0.00005, + "opt_learning_rate_decay_steps": 14523, + "model_bn_span": 204 + }, + "Epochs to converge": [ + 36, 35, 34, 34, 36, 35, 35, 35, 35, 37, + 35, 35, 35, 36, 35 + ] + }, + + "resnet_ref_4096": + { + "Benchmark": "resnet", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-64 / TF2, TF version ~2.4", + "BS": 4096, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 10, + "opt_learning_rate_warmup_epochs": 5, + "opt_momentum": 0.9, + "opt_weight_decay": 2e-4, + "opt_learning_rate_decay_steps": 13137, + "model_bn_span": 64 + }, + "Epochs to converge": [ + 39, 42, 40, 41, 39, 40, 41, 39, 40, 39] + }, + + "resnet_ref_8192": + { + "Benchmark": "resnet", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-128 / TF2, TF version ~2.4", + "BS": 8192, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 17, + "opt_learning_rate_warmup_epochs": 5, + "opt_momentum": 0.9, + "opt_weight_decay": 2e-4, + "opt_learning_rate_decay_steps": 6095, + "model_bn_span": 64 + }, + "Epochs to converge": [ + 42, 44, 43, 41, 41, 41, 42, 42, 43, 41] + }, + + "resnet_ref_13056": + { + "Benchmark": "resnet", + "Creator": "NVIDIA", + "When": "At 1.0 submission", + "Platform": "TBD", + "BS": 13056, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 12.8, + "opt_learning_rate_warmup_epochs": 9, + "opt_momentum": 0.9, + "opt_weight_decay": 2.5e-5, + "opt_learning_rate_decay_steps": 4710, + "model_bn_span": 204 + }, + "Epochs to converge": [ + 46, 46, 44, 46, 45, 46, 45, 45, 45, 46 + ] + }, + + "resnet_ref_32768": + { + "Benchmark": "resnet", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-32 / TF2, TF version ~2.4", + "BS": 32768, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 21.2, + "opt_learning_rate_warmup_epochs": 16, + "opt_momentum": 0.94, + "opt_weight_decay": 2.5e-5, + "opt_learning_rate_decay_steps": 2267, + "model_bn_span": 128 + }, + "Epochs to converge": [ + 56, 56, 55, 56, 56, 56, 56, 56, 57, 56] + }, + + 
"resnet_ref_49152": + { + "Benchmark": "resnet", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 49152, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 19.405, + "opt_end_learning_rate": 0.0001, + "opt_learning_rate_decay_poly_power": 2, + "epsilon": 0, + "opt_learning_rate_warmup_epochs": 24, + "opt_momentum": 0.951807, + "opt_weight_decay": 0.0001, + "opt_learning_rate_decay_steps": 1846, + "model_bn_span": 256 + }, + "Epochs to converge": [ + 68, 68, 68, 68, 69, 68, 68, 67, 69, 67 + ] + }, + + "resnet_ref_58880": + { + "Benchmark": "resnet", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 58880, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 23.246, + "opt_end_learning_rate": 0.0001, + "opt_learning_rate_decay_poly_power": 2, + "epsilon": 0, + "opt_learning_rate_warmup_epochs": 29, + "opt_momentum": 0.951807, + "opt_weight_decay": 0.0001, + "opt_learning_rate_decay_steps": 1785, + "model_bn_span": 230 + }, + "Epochs to converge": [ + 80, 81, 80, 78, 80, 80, 81, 79, 80, 80 + ] + }, + + "resnet_ref_65536": + { + "Benchmark": "resnet", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v3-512 / TF2, TF version ~2.4", + "BS": 65536, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 24.699, + "opt_learning_rate_warmup_epochs": 31, + "opt_momentum": 0.951807, + "opt_weight_decay": 1e-4, + "opt_learning_rate_decay_steps": 1133, + "model_bn_span": 128 + }, + "Epochs to converge": [ + 83, 85, 84, 86, 85, 85, 83, 84, 85, 85] + }, + + "resnet_ref_67840": + { + "Benchmark": "resnet", + "Creator": "NVIDIA", + "When": "At 2.0 submission", + "Platform": "TBD", + "BS": 67840, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 24.699, + "opt_end_learning_rate": 0.0001, + "opt_learning_rate_decay_poly_power": 2, + "epsilon": 0, + "opt_learning_rate_warmup_epochs": 31, + "opt_momentum": 0.951807, + "opt_weight_decay": 0.0001, + "opt_learning_rate_decay_steps": 1133, + "model_bn_span": 424 + }, + "Epochs to converge": [ + 86, 87, 86, 87, 86, 85, 86, 86, 86, 85, 86, 86, 86, 86, 86, 85, 86, 86, 87, 85, 85, 86, 85, 87, 86, 85, 85, 86 + ] + } + +} + diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json new file mode 100644 index 00000000..a4e56d22 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json @@ -0,0 +1,145 @@ +{ + + "ssd_ref_256": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.0", + "Platform": "1xDGX-A100", + "BS": 256, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] + }, + + "ssd_ref_320": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.1", + "Platform": "2xDGX-A100", + "BS": 320, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5] + }, + + "ssd_ref_512": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.0", + "Platform": "8xDGX-A100", + "BS": 512, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + 
"opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] + }, + + "ssd_ref_768": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.0", + "Platform": "8xDGX-A100", + "BS": 768, + "Hyperparams": { + "opt_base_learning_rate": 0.00013, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] + }, + + "ssd_ref_1024": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.0", + "Platform": "8xDGX-A100", + "BS": 1024, + "Hyperparams": { + "opt_base_learning_rate": 0.00011, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + }, + + "ssd_ref_1280": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.1", + "Platform": "16xDGX-A100", + "BS": 1280, + "Hyperparams": { + "opt_base_learning_rate": 0.00013, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7] + }, + + "ssd_ref_2048": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.1", + "Platform": "16xDGX-A100", + "BS": 2048, + "Hyperparams": { + "opt_base_learning_rate": 0.000135, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 9] + }, + + "ssd_ref_4096": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.0", + "Platform": "16xDGX-A100", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 16, 16, 16] + } +} +