From 16fa097119d47edd6de8c698d084424e7f173f59 Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Wed, 5 Feb 2025 16:38:45 -0800
Subject: [PATCH 1/7] adds Llama 3.1 RCPs

---
 mlperf_logging/rcp_checker/rcp_checker.py     |  1 +
 .../training_5.0.0/rcps_llama31_405b.json     | 68 +++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json

diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
index d54d00a..f86d533 100644
--- a/mlperf_logging/rcp_checker/rcp_checker.py
+++ b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -32,6 +32,7 @@
         'gnn': 10,
         'rgat': 10,
         'llama2_70b_lora': 10,
+        'llama31_405b': 3,
     },
     "hpc": {
         'cosmoflow': 10,

diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
new file mode 100644
index 0000000..607fee4
--- /dev/null
+++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
@@ -0,0 +1,68 @@
+{
+  "llama31_405b_ref_1152":
+  {
+    "Benchmark": "llama31_405b",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before 5.0 submission",
+    "Platform": "288xDGX-H100",
+    "BS": 1152,
+    "Hyperparams": {
+      "opt_base_learning_rate": 8e-05,
+      "opt_learning_rate_warmup_steps": 8000,
+      "gradient_accumulation_steps": 144
+    },
+    "Epochs to converge": [
+        2642411520,2642411520,2642411520,
+        2642411520,2642411520,2642411520
+    ]
+  },
+
+  "llama31_405b_ref_2304":
+  {
+    "Benchmark": "llama31_405b",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before 5.0 submission",
+    "Platform": "288xDGX-H100",
+    "BS": 2304,
+    "Hyperparams": {
+      "opt_base_learning_rate": 16e-05,
+      "opt_learning_rate_warmup_steps": 4000,
+      "gradient_accumulation_steps": 288
+    },
+    "Epochs to converge": [
+        3019898880,3019898880,3019898880,
+        3019898880,3397386240,3397386240
+    ]
+  },
+  "llama31_405b_ref_4608":
+  {
+    "Benchmark": "llama31_405b",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before 5.0 submission",
+    "Platform": "288xDGX-H100",
+    "BS": 4608,
+    "Hyperparams": {
+      "opt_base_learning_rate": 32e-05,
+      "opt_learning_rate_warmup_steps": 2000,
+      "gradient_accumulation_steps": 576
+    },
+    "Epochs to converge": [
+    ]
+  },
+  "llama31_405b_ref_9216":
+  {
+    "Benchmark": "llama31_405b",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before 5.0 submission",
+    "Platform": "288xDGX-H100",
+    "BS": 9216,
+    "Hyperparams": {
+      "opt_base_learning_rate": 64e-05,
+      "opt_learning_rate_warmup_steps": 1000,
+      "gradient_accumulation_steps": 1152
+    },
+    "Epochs to converge": [
+    ]
+  }
+}
+ 
\ No newline at end of file

From 6f31c742cd29d81bc868e9e96b4b65b703971475 Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Wed, 5 Feb 2025 16:48:51 -0800
Subject: [PATCH 2/7] adds compliance checker

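Note: CHECK and POST entries are Python expressions that the compliance
checker evaluates with `v` bound to the logged record and `s` to a dict
of state shared across keys; the POST on global_batch_size stores the
batch size so the learning-rate rule can scale against it. A rough
sketch of that evaluation (sketch only; the names are illustrative,
not the checker's internal API):

    # sketch: run a POST, then a CHECK that reads the state it saved
    s = {}
    gbs = {"value": 2304, "metadata": {}}    # global_batch_size record
    lr = {"value": 16e-05, "metadata": {}}   # opt_base_learning_rate record
    exec("s['global_batch_size'] = v['value']", {"s": s, "v": gbs})  # POST
    ok = eval("v['value'] * 1152 == s['global_batch_size'] * 8e-5",
              {"s": s, "v": lr})             # CHECK
    assert ok  # 16e-05 * 1152 == 2304 * 8e-5: LR scales linearly with BS
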
---
 .../training_5.0.0/closed_llama31_405b.yaml   | 80 +++++++++++++++++++
 .../training_5.0.0/open_llama31_405b.yaml     | 74 +++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml
 create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml

diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml
new file mode 100644
index 0000000..d93afa6
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml
@@ -0,0 +1,80 @@
+- KEY:
+    NAME: global_batch_size
+    REQ: EXACTLY_ONE
+    POST: >
+        s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME: max_sequence_length
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 8192 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adam' "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] * 1152 == s['global_batch_size'] * 8e-5 "
+
+- KEY:
+    NAME: opt_end_learning_rate
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_schedule
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'cosine with linear warmup' "
+
+- KEY:
+    NAME: opt_adam_beta_1
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.9 "
+
+- KEY:
+    NAME: opt_adam_beta_2
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.95 "
+
+- KEY:
+    NAME: opt_adam_epsilon
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1e-05 "
+
+- KEY:
+    NAME: opt_gradient_clip_norm
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1.0 "
+
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME: eval_samples
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 47185920 "
+
+- KEY:
+    NAME: eval_accuracy
+    REQ: AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0"
+
+- KEY:
+    NAME: init_checkpoint_step
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+

diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml
new file mode 100644
index 0000000..d16b0c8
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml
@@ -0,0 +1,74 @@
+- KEY:
+    NAME: global_batch_size
+    REQ: EXACTLY_ONE
+    POST: >
+        s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME: max_sequence_length
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 8192 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adam' "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_end_learning_rate
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_schedule
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adam_beta_1
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adam_beta_2
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adam_epsilon
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_gradient_clip_norm
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME: eval_samples
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 47185920 "
+
+- KEY:
+    NAME: eval_accuracy
+    REQ: AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0"
+
+- KEY:
+    NAME: init_checkpoint_step
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+

From c0791949ee9a96fa1f0206f0d52234cd3ae94f08 Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Wed, 5 Feb 2025 16:49:51 -0800
Subject: [PATCH 3/7] adds Llama 3.1 405B constant

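A reference implementation would tag its result logs with this
constant; a minimal usage sketch with the package's public mllog
helpers (get_mllogger and event, which mllog already exposes):

    # minimal sketch of emitting the benchmark-name event
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants

    mllogger = mllog.get_mllogger()
    mllogger.event(key=constants.SUBMISSION_BENCHMARK,
                   value=constants.LLAMA31_405B)
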
---
 mlperf_logging/mllog/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py
index 54904ba..d272c1e 100644
--- a/mlperf_logging/mllog/constants.py
+++ b/mlperf_logging/mllog/constants.py
@@ -54,6 +54,7 @@
 LLAMA2_70B_LORA = "llama2_70b_lora"
 GNN = "gnn"
 RGAT = "rgat"
+LLAMA31_405B = "llama31_405b"
 
 # Constant values - model info
 ADAGRAD = "adagrad"

From 9c289f74210512ff601f1506fbd744ee4356bd92 Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Wed, 5 Feb 2025 16:53:01 -0800
Subject: [PATCH 4/7] updates common yaml, result summarizer, and benchmark
 meta

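With the benchmark in the closed/open CHECK lists, the common POST hook
can dispatch to the per-benchmark rules added in patch 2; the dispatch
amounts to a format-string lookup (illustrative values only):

    # illustrative: the format string evaluated by the common POST hook
    v = {"value": "llama31_405b"}
    config = "training_5.0.0/closed_{}.yaml".format(v["value"])
    assert config == "training_5.0.0/closed_llama31_405b.yaml"
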
---
 mlperf_logging/benchmark_meta.py                           | 4 +++-
 .../compliance_checker/training_5.0.0/closed_common.yaml   | 2 +-
 .../compliance_checker/training_5.0.0/open_common.yaml     | 2 +-
 mlperf_logging/result_summarizer/config.yaml               | 1 +
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
index 54792e8..d323050 100644
--- a/mlperf_logging/benchmark_meta.py
+++ b/mlperf_logging/benchmark_meta.py
@@ -19,6 +19,7 @@
         'gnn' : 10,
         'rgat': 10,
         'llama2_70b_lora': 10,
+        'llama31_405b': 3,
     },
 
     'hpc' : {
@@ -140,7 +141,8 @@
             'retinanet',
             'stable_diffusion',
             'llama2_70b_lora',
-            'rgat'
+            'rgat',
+            'llama31_405b'
         ]
     },
 
diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml
index e5cc807..4d0ea48 100755
--- a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml
+++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml
@@ -2,7 +2,7 @@
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora'] "
+    CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora', 'llama31_405b'] "
     POST: " enqueue_config('training_5.0.0/closed_{}.yaml'.format(v['value'])) "
 
diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml
index 5b4f1a3..85803a8 100644
--- a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml
+++ b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml
@@ -2,5 +2,5 @@
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora'] "
+    CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora', 'llama31_405b'] "
     POST: " enqueue_config('training_5.0.0/open_{}.yaml'.format(v['value'])) "

diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml
index 9a07065..b308035 100644
--- a/mlperf_logging/result_summarizer/config.yaml
+++ b/mlperf_logging/result_summarizer/config.yaml
@@ -91,6 +91,7 @@
     stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"]
     llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLLS Gov Report", "LLama2-70B-LoRA"]
     rgat: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"]
+    llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"]
     default: [" ", " ", " "]
 
   hpc:

From 65000ecd848eda745093b70ee5cfec500a1b98aa Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Fri, 14 Feb 2025 08:45:32 -0800
Subject: [PATCH 5/7] adds 2 RCPs for 4k

---
 mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
index 607fee4..eb41347 100644
--- a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
+++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
@@ -47,6 +47,7 @@
       "gradient_accumulation_steps": 576
     },
     "Epochs to converge": [
+        3774873600,4152360960
     ]
   },
   "llama31_405b_ref_9216":

From 7bcf915ac42225a8bb06e4708ae1355034922cfb Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Thu, 20 Feb 2025 09:54:28 -0800
Subject: [PATCH 6/7] use samples instead of tokens

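Convergence points and eval_samples are now recorded in samples
(sequences) rather than tokens; with max_sequence_length fixed at 8192,
the conversion is tokens / 8192:

    # worked conversion at 8192 tokens per sample
    assert 2_642_411_520 // 8192 == 322_560   # BS 1152 convergence points
    assert 3_019_898_880 // 8192 == 368_640   # BS 2304 convergence points
    assert 47_185_920 // 8192 == 5_760        # eval_samples
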
---
 .../training_5.0.0/closed_llama31_405b.yaml            |  2 +-
 .../training_5.0.0/open_llama31_405b.yaml              |  2 +-
 .../rcp_checker/training_5.0.0/rcps_llama31_405b.json  | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml
index d93afa6..76a6c74 100644
--- a/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml
+++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_llama31_405b.yaml
@@ -64,7 +64,7 @@
 - KEY:
     NAME: eval_samples
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] == 47185920 "
+    CHECK: " v['value'] == 5760 "
 
 - KEY:
     NAME: eval_accuracy

diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml
index d16b0c8..3b642ff 100644
--- a/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml
+++ b/mlperf_logging/compliance_checker/training_5.0.0/open_llama31_405b.yaml
@@ -58,7 +58,7 @@
 - KEY:
     NAME: eval_samples
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] == 47185920 "
+    CHECK: " v['value'] == 5760 "
 
 - KEY:
     NAME: eval_accuracy

diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
index eb41347..174b183 100644
--- a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
+++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
@@ -12,8 +12,8 @@
       "gradient_accumulation_steps": 144
     },
     "Epochs to converge": [
-        2642411520,2642411520,2642411520,
-        2642411520,2642411520,2642411520
+        322560,322560,322560,
+        322560,322560,322560
     ]
   },
 
@@ -30,8 +30,8 @@
       "gradient_accumulation_steps": 288
     },
     "Epochs to converge": [
-        3019898880,3019898880,3019898880,
-        3019898880,3397386240,3397386240
+        368640,368640,368640,
+        368640,414720,414720
     ]
   },
   "llama31_405b_ref_4608":
@@ -47,7 +47,7 @@
       "gradient_accumulation_steps": 576
     },
     "Epochs to converge": [
-        3774873600,4152360960
+        460800, 460800, 506880
     ]
   },
   "llama31_405b_ref_9216":

From c293cb461ddbdc4a79cc1732106f33b8eaf04417 Mon Sep 17 00:00:00 2001
From: Yunzhou Liu
Date: Fri, 21 Feb 2025 13:46:24 -0800
Subject: [PATCH 7/7] adds remaining 4608 RCPs

---
 .../training_5.0.0/rcps_llama31_405b.json | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
index 174b183..8b298f3 100644
--- a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
+++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama31_405b.json
@@ -47,22 +47,8 @@
       "gradient_accumulation_steps": 576
     },
     "Epochs to converge": [
-        460800, 460800, 506880
-    ]
-  },
-  "llama31_405b_ref_9216":
-  {
-    "Benchmark": "llama31_405b",
-    "Creator": "NVIDIA",
-    "When": "Reference RCPs before 5.0 submission",
-    "Platform": "288xDGX-H100",
-    "BS": 9216,
-    "Hyperparams": {
-      "opt_base_learning_rate": 64e-05,
-      "opt_learning_rate_warmup_steps": 1000,
-      "gradient_accumulation_steps": 1152
-    },
-    "Epochs to converge": [
+        460800,460800,506880,
+        506880,506880,506880
     ]
   }
 }