Skip to content

Commit

Permalink
Add BERT-L perf regression test on MI100 and re-enable batch size test (#7240)
Browse files Browse the repository at this point in the history

* restore bs test and add perf test

* update perf number and fix path to results
  • Loading branch information
Suffian Khan authored Apr 5, 2021
1 parent 10102c0 commit 9f14af9
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 30 deletions.
6 changes: 3 additions & 3 deletions orttraining/tools/ci_test/run_batch_size_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def main():
]

configs['MI100_32G'] = [
Config(True, 128, 201, 20, ""),
Config(True, 512, 31, 80, ""),
Config(False, 128, 109, 20, ""),
Config(True, 128, 200, 20, ""),
Config(True, 512, 30, 80, ""),
Config(False, 128, 108, 20, ""),
Config(False, 512, 16, 80, ""),
]

Expand Down
33 changes: 24 additions & 9 deletions orttraining/tools/ci_test/run_bert_perf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import sys
import os
import json
from collections import namedtuple

SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__))
Expand All @@ -18,27 +19,36 @@ def parse_args():
help="Path to the training data root directory.")
parser.add_argument("--model_root", required=True,
help="Path to the model root directory.")
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
help="GPU model (e.g. V100_16G, MI100_32G).")
return parser.parse_args()

# using the same params from "GitHub Master Merge Schedule" in OneNote
def main():
args = parse_args()

Config = namedtuple('Config', ['use_mixed_precision', 'max_seq_length', 'batch_size', 'max_predictions_per_seq'])
configs = [
Config(True, 128, 76, 20),
Config(True, 512, 11, 80),
Config(False, 128, 39, 20),
Config(False, 512, 6, 80)
Config = namedtuple('Config', ['use_mixed_precision', 'max_seq_length', 'batch_size', 'max_predictions_per_seq', 'expected_perf'])
configs = {}
configs['V100_16G'] = [
Config(True, 128, 76, 20, -1.0),
Config(True, 512, 11, 80, -1.0),
Config(False, 128, 39, 20, -1.0),
Config(False, 512, 6, 80, -1.0)
]

configs['MI100_32G'] = [
Config(True, 128, 128, 20, 240),
]

# run BERT training
for c in configs:
for c in configs[args.gpu_sku]:
model = 'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12'
precision_prefix = ('fp16' if c.use_mixed_precision else 'fp32')
print("######## testing name - " + ('fp16-' if c.use_mixed_precision else 'fp32-') + str(c.max_seq_length) + " ##############")
cmds = [
os.path.join(args.binary_dir, "onnxruntime_training_bert"),
"--model_name", os.path.join(
args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
args.model_root, "nv/bert-large/{}".format(model)),
"--train_data_dir", os.path.join(
args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"),
"--test_data_dir", os.path.join(
Expand All @@ -64,7 +74,12 @@ def main():
cmds.append("--allreduce_in_fp16"),

subprocess.run(cmds).check_returncode()

if c.expected_perf > 0.0:
json_filename = 'onnxruntime_perf_metrics_{}.onnx_bert_{}_{}_Lamb.json'.format(model, precision_prefix, c.max_seq_length)
with open(os.path.join(SCRIPT_DIR, 'results', json_filename)) as json_file:
results = json.load(json_file)
assert(results['EndToEndThroughput'] > 0.98*c.expected_perf)

return 0

if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,30 @@ jobs:
- script: |-
cd ./build/RelWithDebInfo &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run unit tests'
displayName: 'Run onnxruntime unit tests'
# - script: |-
# python orttraining/tools/ci_test/run_batch_size_test.py \
# --binary_dir build/RelWithDebInfo \
# --model_root training_e2e_test_data/models \
# --gpu_sku MI100_32G
# displayName: 'Run batch size test'
# condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_bert_perf_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L performance test'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run convergence test'
displayName: 'Run C++ BERT-L convergence test'
condition: succeededOrFailed() # ensure all tests are run
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,30 @@ steps:
- script: |-
cd ./build/RelWithDebInfo &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run unit tests'
displayName: 'Run onnxruntime unit tests'

#- script: |-
# python orttraining/tools/ci_test/run_batch_size_test.py \
# --binary_dir build/RelWithDebInfo \
# --model_root training_e2e_test_data/models \
# --gpu_sku MI100_32G
# displayName: 'Run batch size test'
# condition: succeededOrFailed() # ensure all tests are run
- script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test'
condition: succeededOrFailed() # ensure all tests are run

- script: |-
python orttraining/tools/ci_test/run_bert_perf_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L performance test'
condition: succeededOrFailed() # ensure all tests are run

- script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run convergence test'
displayName: 'Run C++ BERT-L convergence test'
condition: succeededOrFailed() # ensure all tests are run

0 comments on commit 9f14af9

Please sign in to comment.