Skip to content

Commit

Permalink
Add BERT-L perf regression test on MI100 and re-enable batch size test (#7240)
Browse files Browse the repository at this point in the history

* restore bs test and add perf test

* update perf number and fix path to results
  • Loading branch information
Suffian Khan authored Apr 5, 2021
1 parent 10102c0 commit 9f14af9
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 30 deletions.
6 changes: 3 additions & 3 deletions orttraining/tools/ci_test/run_batch_size_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def main():
]

configs['MI100_32G'] = [
Config(True, 128, 201, 20, ""),
Config(True, 512, 31, 80, ""),
Config(False, 128, 109, 20, ""),
Config(True, 128, 200, 20, ""),
Config(True, 512, 30, 80, ""),
Config(False, 128, 108, 20, ""),
Config(False, 512, 16, 80, ""),
]

Expand Down
33 changes: 24 additions & 9 deletions orttraining/tools/ci_test/run_bert_perf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import sys
import os
import json
from collections import namedtuple

SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__))
Expand All @@ -18,27 +19,36 @@ def parse_args():
help="Path to the training data root directory.")
parser.add_argument("--model_root", required=True,
help="Path to the model root directory.")
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
help="GPU model (e.g. V100_16G, MI100_32G).")
return parser.parse_args()

# using the same params from "GitHub Master Merge Schedule" in OneNote
def main():
args = parse_args()

Config = namedtuple('Config', ['use_mixed_precision', 'max_seq_length', 'batch_size', 'max_predictions_per_seq'])
configs = [
Config(True, 128, 76, 20),
Config(True, 512, 11, 80),
Config(False, 128, 39, 20),
Config(False, 512, 6, 80)
Config = namedtuple('Config', ['use_mixed_precision', 'max_seq_length', 'batch_size', 'max_predictions_per_seq', 'expected_perf'])
configs = {}
configs['V100_16G'] = [
Config(True, 128, 76, 20, -1.0),
Config(True, 512, 11, 80, -1.0),
Config(False, 128, 39, 20, -1.0),
Config(False, 512, 6, 80, -1.0)
]

configs['MI100_32G'] = [
Config(True, 128, 128, 20, 240),
]

# run BERT training
for c in configs:
for c in configs[args.gpu_sku]:
model = 'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12'
precision_prefix = ('fp16' if c.use_mixed_precision else 'fp32')
print("######## testing name - " + ('fp16-' if c.use_mixed_precision else 'fp32-') + str(c.max_seq_length) + " ##############")
cmds = [
os.path.join(args.binary_dir, "onnxruntime_training_bert"),
"--model_name", os.path.join(
args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
args.model_root, "nv/bert-large/{}".format(model)),
"--train_data_dir", os.path.join(
args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"),
"--test_data_dir", os.path.join(
Expand All @@ -64,7 +74,12 @@ def main():
cmds.append("--allreduce_in_fp16"),

subprocess.run(cmds).check_returncode()

if c.expected_perf > 0.0:
json_filename = 'onnxruntime_perf_metrics_{}.onnx_bert_{}_{}_Lamb.json'.format(model, precision_prefix, c.max_seq_length)
with open(os.path.join(SCRIPT_DIR, 'results', json_filename)) as json_file:
results = json.load(json_file)
assert(results['EndToEndThroughput'] > 0.98*c.expected_perf)

return 0

if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,30 @@ jobs:
- script: |-
cd ./build/RelWithDebInfo &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run unit tests'
displayName: 'Run onnxruntime unit tests'
# - script: |-
# python orttraining/tools/ci_test/run_batch_size_test.py \
# --binary_dir build/RelWithDebInfo \
# --model_root training_e2e_test_data/models \
# --gpu_sku MI100_32G
# displayName: 'Run batch size test'
# condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_bert_perf_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L performance test'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run convergence test'
displayName: 'Run C++ BERT-L convergence test'
condition: succeededOrFailed() # ensure all tests are run
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,30 @@ steps:
- script: |-
cd ./build/RelWithDebInfo &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run unit tests'
displayName: 'Run onnxruntime unit tests'

#- script: |-
# python orttraining/tools/ci_test/run_batch_size_test.py \
# --binary_dir build/RelWithDebInfo \
# --model_root training_e2e_test_data/models \
# --gpu_sku MI100_32G
# displayName: 'Run batch size test'
# condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test'
condition: succeededOrFailed() # ensure all tests are run

- script: |-
python orttraining/tools/ci_test/run_bert_perf_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L performance test'
condition: succeededOrFailed() # ensure all tests are run

- script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run convergence test'
displayName: 'Run C++ BERT-L convergence test'
condition: succeededOrFailed() # ensure all tests are run

0 comments on commit 9f14af9

Please sign in to comment.