From e2531b777e1d70975d33cbccd11c43b38c6507a2 Mon Sep 17 00:00:00 2001 From: Marek Wawrzos Date: Tue, 11 May 2021 19:46:45 +0200 Subject: [PATCH 1/5] [RCP checker] fix batch size value for RNN-T RCPs (#121) --- mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json b/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json index 81942532..6fef28f7 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json @@ -3,7 +3,7 @@ "rnn_t_ref_1k": { "Benchmark": "rnnt", - "BS": 128, + "BS": 1024, "Hyperparams": { "opt_base_learning_rate": 0.004, "opt_lamb_learning_rate_hold_epochs": 40, @@ -26,7 +26,7 @@ "rnn_t_ref_2k": { "Benchmark": "rnnt", - "BS": 256, + "BS": 2048, "Hyperparams": { "opt_base_learning_rate": 0.007, "opt_lamb_learning_rate_hold_epochs": 40, From a51c1e39a159023774696da639a99b7d2090066a Mon Sep 17 00:00:00 2001 From: Matthew Frank Date: Tue, 11 May 2021 13:00:42 -0500 Subject: [PATCH 2/5] adding pack_submission script (#102) --- pack_submission.sh | 86 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100755 pack_submission.sh diff --git a/pack_submission.sh b/pack_submission.sh new file mode 100755 index 00000000..6508e676 --- /dev/null +++ b/pack_submission.sh @@ -0,0 +1,86 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +## Encrypting your project for submission + +# In MLPerf Training v1.0, a policy has been introduced to allow submitters +# to submit an encrypted tarball of their submission repository along with the +# decryption password and SHA1 hash of the encrypted tarball to the MLPerf +# Training results chair. + +# To create an encrypted tarball and generate the SHA1 of the tarball, first +# set the `SUBMITTER` environment variable to your +# company name. Then from the project root, run: + +# SUBMITTER=YourCompany bash pack_submission.sh --pack + +# This command will prompt to enter and then confirm an encryption password. +# After this command finishes running, there will be 2 files: + +# - `mlperf_submission_${SUBMITTER}.tar.gz` - The encrypted tarball, encrypted with AES256 +# - `mlperf_submission_${SUBMITTER}.sha1` - A text file containing the sha1 hash of the encrypted tarball + +# To test that the submission has been successfully packed, run: + +# SUBMITTER=YourCompany bash path/to/pack_submission.sh --unpack + +# The 3 things that must be shared with the MLPerf Training results chair for +# submission are: +# 1. `mlperf_submission_${SUBMITTER}.tar.gz` - The encrypted tarball, encrypted with AES256 +# 2. `mlperf_submission_${SUBMITTER}.sha1` - A text file containing the sha1 hash of the encrypted tarball +# 3. The decryption password + +# Before submission deadline, upload the tarball to a public cloud storage and +# email the link along with items 2-3 to the MLCommons submissions address: submissions@mlcommons.org +# Also, include the last two lines of the submission_checker_log.txt like +# below in the body of the email as cursory evidence of a valid submission. 
+ +# INFO:main:Results=265, NoResults=0 +# INFO:main:SUMMARY: submission looks OK + +function print_error_and_exit { + echo "usage: SUBMITTER= ${0} --pack|--unpack|--list" + exit 1 +} + +# make sure SUBMITTER is set +[ -z "${SUBMITTER}" ] && print_error_and_exit + +TARBALL_NAME=mlperf_submission_${SUBMITTER}.tar.gz +SHA1_FILE_NAME=mlperf_submission_${SUBMITTER}.sha1 +FOLDERS=$(find -mindepth 1 -maxdepth 1 -type d -not -name '.*') + +if [ "$1" = "--pack" ]; then + echo "Packing folders ${FOLDERS} into tarball ${TARBALL_NAME} and encrypting" + tar --create --gzip --file - ${FOLDERS} | openssl enc -e -aes256 -out ${TARBALL_NAME} + echo "Generating sha1sum of tarball" + sha1sum ${TARBALL_NAME} | tee ${SHA1_FILE_NAME} +elif [ "$1" = "--unpack" ]; then + echo "Checking sha1sum of tarball" + if [ "`sha1sum ${TARBALL_NAME}`" = "`cat ${SHA1_FILE_NAME}`" ]; then + echo "sha1sum matches." + openssl enc -d -aes256 -in ${TARBALL_NAME} | tar --extract --gzip --file - + else + echo "ERROR: sha1sum of ${TARBALL_NAME} does not match contents of ${SHA1_FILE_NAME}" + fi +elif [ "$1" = "--list" ]; then + if [ "`sha1sum ${TARBALL_NAME}`" = "`cat ${SHA1_FILE_NAME}`" ]; then + openssl enc -d -aes256 -in ${TARBALL_NAME} | tar --list --gzip --file - + else + echo "ERROR: sha1sum of ${TARBALL_NAME} does not match contents of ${SHA1_FILE_NAME}" + fi +else + print_error_and_exit +fi + From 4a2517ed06a1d5962f1aeb3e0d1f61b5b1cffb85 Mon Sep 17 00:00:00 2001 From: itay hubara Date: Fri, 14 May 2021 05:24:39 +0300 Subject: [PATCH 3/5] adding some constants for BERT (#113) Co-authored-by: Xinyuan Huang --- mlperf_logging/mllog/constants.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index a6a86f8d..b2f2fec3 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -46,12 +46,14 @@ TRANSFORMER = "transformer" RNNT = "rnnt" UNET3D = "unet3d" +BERT ="bert" # Constant values - model info 
ADAM = "adam" LARS = "lars" LAZY_ADAM = "lazy_adam" SGD = "sgd" +LAMB ="lamb" # Constant values - metadata info ABORTED = "aborted" @@ -101,6 +103,7 @@ MIN_IMAGE_SIZE = "min_image_size" MODEL_BN_SPAN = "model_bn_span" NUM_IMAGE_CANDIDATES = "num_image_candidates" +NUM_WARMUP_STEPS = "num_warmup_steps" OPT_ADAM_BETA_1 = "opt_adam_beta_1" OPT_ADAM_BETA_2 = "opt_adam_beta_2" OPT_ADAM_EPSILON = "opt_adam_epsilon" @@ -108,6 +111,7 @@ OPT_BASE_LR = "opt_base_learning_rate" OPT_LAMB_LR_MIN = "opt_lamb_learning_rate_min" OPT_LAMB_LR_DECAY_POLY_POWER = "opt_lamb_learning_rate_decay_poly_power" +OPT_LAMB_WEIGHT_DECAY = "opt_lamb_weight_decay_rate" OPT_LAMB_BETA_1 = "opt_lamb_beta_1" OPT_LAMB_BETA_2 = "opt_lamb_beta_2" OPT_LAMB_EPSILON = "opt_lamb_epsilon" @@ -121,6 +125,7 @@ OPT_LR_DECAY_START_STEP = "opt_learning_rate_decay_start_step" OPT_LR_DECAY_STEPS = "opt_learning_rate_decay_steps" OPT_LR_REMAIN_STEPS = "opt_learning_rate_remain_steps" +OPT_LR_TRAINING_STEPS = "opt_learning_rate_training_steps" OPT_LR_WARMUP_EPOCHS = "opt_learning_rate_warmup_epochs" OPT_LR_WARMUP_FACTOR = "opt_learning_rate_warmup_factor" OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps" @@ -140,6 +145,7 @@ MODEL_EVAL_EMA_FACTOR = "model_eval_ema_factor" MODEL_WEIGHTS_INITIALIZATION_SCALE = "model_weights_initialization_scale" EVAL_MAX_PREDICTION_SYMBOLS = "eval_max_prediction_symbols" +START_WARMUP_STEP ="start_warmup_step" # Log keys - misc. BBOX = "bbox" From 7b1fb7df3224272a9e4b54d9afc2e28ef30e2bd1 Mon Sep 17 00:00:00 2001 From: Shang Wang <66387198+shangw-nvidia@users.noreply.github.com> Date: Thu, 13 May 2021 22:25:02 -0400 Subject: [PATCH 4/5] [RCP][DLRM] Changing epoch number starting from 1. 
(#123) --- mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json b/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json index 99165ad9..2a3dfb0a 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json @@ -13,7 +13,8 @@ "sgd_opt_learning_rate_decay_steps": 30000 }, "Epochs to converge": [ - 0.8, 0.75, 0.75, 0.75, 0.75, 0.8, 0.7, 0.75, 0.75, 0.75] + 1.8, 1.75, 1.75, 1.75, 1.75, 1.8, 1.7, 1.75, 1.75, 1.75 + ] }, "dlrm_ref_55296": @@ -29,9 +30,10 @@ "sgd_opt_learning_rate_decay_steps": 27772 }, "Epochs to converge": [ - 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95, - 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, - 0.9, 0.9, 0.85, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9] + 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.95, + 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, + 1.9, 1.9, 1.85, 1.9, 1.9, 1.9, 1.9, 1.95, 1.9 + ] } } From ab442825763bcecc3dc83d0b4d49df2d62c6ca77 Mon Sep 17 00:00:00 2001 From: Elias Mizan <43765737+emizan76@users.noreply.github.com> Date: Fri, 14 May 2021 19:10:00 -0700 Subject: [PATCH 5/5] RCP update: Resnet 64K, Unet3D, BERT (#119) * First RCP checker commit, just a small commit with a README file to make sure the github + forking flow work for me. * Rcp_checker implementation: * Added 1.0.0/rcps.json file. Still in progress as RCPs have not been finalized * Code is in rcp_checker.py. This currently contains a single RCP_Checker class with functions to consume json file, construct the RCP structure, compute means, stdevs, and min allowed speedups, find RCPs based on benchmark and batch size, and generate interpolated RCPs. This is all the processing needed to happen at startup. No support yet to process and evaluate submission runs. This is TBD * __main__.py run a couple of simple tests, this will be moved eventually to a separate test file. 
* Added a few more 1.0.0 RCPs (still in progress) Added submission directory processing and comparison to RCPs in rcp_checker. Connected RCP checker to the result_summarizer. Fixed a couple of bugs. * Added remaining RCPs (resnet, bert, rnnt, unet3d), and fixed ones already in (maskrcnn, dlrm, ssd). Made a few fixes suggested by Victor * Update mlperf_logging/rcp_checker/README.md Co-authored-by: Marek Wawrzos * One step closer to v1.0.0 * System Description Checker: - Updated to 1.0.0 * Package Checker: - Added support for 1.0.0 - Added calls to the RCP checker - Added call to the system description checker - Added support for the Unet3d olympic scoring (reject top and bottom 4) * Results Summarizer - Added support for 1.0.0 - Refactored olympic scoring calculation to be able to accommodate unet3d (reject top and bottom 4) - Made a couple of fixes to RCP checker interface and disabled RCP checks for minigo. * RCP Checker - Split monolithic RCP json file into 1 json file / benchmark. This improves readability and makes adding more RCPs easier - Added support for Unet3D RCP checking: Reject top and bottom 4 scores instead of 1 - Added verbose mode to assist submitters with debugging - Fixed a couple of bugs I found after previous PR was merged. * Documentation: - Updated README files for RCP checker, results summarizer, package checker and system description checker * Fixed suggested by Marek. * Fixed a bug in the RCP checker Updated max compile time rulw for 20mins to 30mins. Removed a print statement from the result_summarizer. * Logging 1.0.0 fixes based on some testing and more knowledge on submission procedure 1. Added --rcp_bypass command line flag in package checker. Submitter can use it to allow uploading of benchmarks that fail the RCP test. This is a package checker flag that is propagated to the RCP checker. It has no meaning using it on a standalone RCP checker run, as the package checker outputs controls whether a submission is valid. 2. 
Removed RCP checker from result_summarizer. It does not need to run there as it is called by the package checker. 3. Fixes for open submission: Do not call the seed checker, nor the RCP checker. Fixed a bug where open_common was including closed_ rules. Since submitters in the open category can now use their own convergence rules I removed the convergence rules used in v0.7. So now the only rules for open submissions are the number of runs and open_common compliance rules. * Forgot to add verifier 1.0 top-level script in my previous commit. * Fixed failures pointed by Shang: - Line can start with :::MLLOG but it islegal to have anything else before :::MLLOG - Opened log files as latin-1, just like the compliance checker * Added Resnet temporary RCP for B=64K. The RCP was derived by Google's 0.7 tpu-v3-8192-TF submission and the 5 runs were duplicated Updated Unet3D RCPs. * Fixed RCP checker bug: When there were non-converging runs, the mean epochs to converge for the submission was under-reported. Updated compliance README file to 1.0.0 * Added final Resnet 64K RCP and updated 8K RCP. * Updated RCPs for Bert: Removed 768, added 1536 and updated 3072. 
Co-authored-by: Marek Wawrzos --- mlperf_logging/compliance_checker/README.md | 28 ++++++------ mlperf_logging/compliance_checker/__main__.py | 2 + .../rcp_checker/1.0.0/rcps_bert.json | 30 ++++++------- .../rcp_checker/1.0.0/rcps_resnet.json | 27 ++++++++++-- .../rcp_checker/1.0.0/rcps_unet3d.json | 43 +++++++++---------- mlperf_logging/rcp_checker/rcp_checker.py | 2 +- 6 files changed, 77 insertions(+), 55 deletions(-) diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index c75b43e7..fe15615d 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -10,8 +10,9 @@ To check a log file for compliance: python -m mlperf_logging.compliance_checker [--config YAML] [--ruleset MLPERF_EDITION] FILENAME -By default, 0.7.0 edition rules are used and the default config is set to `0.7.0/common.yaml`. +By default, 1.0.0 edition rules are used and the default config is set to `1.0.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. +Older editions still supported are 0.7.0 and 0.6.0. Prints `SUCCESS` when no issues were found. Otherwise will print error details. 
@@ -19,20 +20,20 @@ As log examples use [NVIDIA's v0.6 training logs](https://github.com/mlperf/trai ### Existing config files - 0.7.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file - 0.7.0/resnet.yaml - 0.7.0/ssd.yaml - 0.7.0/minigo.yaml - 0.7.0/maskrcnn.yaml - 0.7.0/gnmt.yaml - 0.7.0/transformer.yaml - 0.7.0/bert.yaml - 0.7.0/dlrm.yaml + 1.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file + 1.0.0/resnet.yaml + 1.0.0/ssd.yaml + 1.0.0/minigo.yaml + 1.0.0/maskrcnn.yaml + 1.0.0/rnnt.yaml + 1.0.0/unet3d.yaml + 1.0.0/bert.yaml + 1.0.0/dlrm.yaml ### Implementation details Compliance checking is done following below algorithm. -1. Parser converts the log into a list of records, each record corresponds to MLL +1. Parser converts the log into a list of records, each record corresponds to MLLOG line and contains all relevant extracted information 2. Set of rules to be checked in loaded from provided config yaml file 3. Process optional `BEGIN` rule if present by executing provided `CODE` section @@ -114,7 +115,7 @@ Example: `ll` is a structure representing current log line that triggered `KEY` record. `ll` has the following fields that can be accessed: - `full_string` - the complete line as a string -- `timestamp` - seconds as a float, e.g. 
1234.567 +- `timestamp` - milliseconds as an integer - `key` - the string key - `value` - the parsed value associated with the key, or None if no value - `lineno` - line number in the original file of the current key @@ -143,7 +144,7 @@ Example: NAME: submission_benchmark REQ: EXACTLY_ONE CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt'] " - POST: " enqueue_config('0.7.0/{}.yaml'.format(v['value'])) " + POST: " enqueue_config('1.0.0/{}.yaml'.format(v['value'])) " #### Other operations @@ -158,6 +159,7 @@ For instance, can define rules that would print out information as shown in the Tested and confirmed working using the following software versions: - Python 2.7.12 + PyYAML 3.11 - Python 3.6.8 + PyYAML 5.1 +- Python 3.9.2 + PyYAML 5.3.1 ### How to install PyYaML diff --git a/mlperf_logging/compliance_checker/__main__.py b/mlperf_logging/compliance_checker/__main__.py index 438bcd7e..1b8f808e 100644 --- a/mlperf_logging/compliance_checker/__main__.py +++ b/mlperf_logging/compliance_checker/__main__.py @@ -23,3 +23,5 @@ if not valid: sys.exit(1) +else: + print('SUCCESS') diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json b/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json index 7bbd3124..39b61ae7 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json @@ -38,23 +38,23 @@ 2508800, 2458624, 2684416, 2533888, 2533888, 2784768, 2308096, 2784768, 2584064, 2809856] }, - "bert_ref_768": + "bert_ref_1536": { "Benchmark": "bert", - "BS": 768, + "BS": 1536, "Hyperparams": { - "opt_base_learning_rate": 0.00035, + "opt_base_learning_rate": 0.002, "opt_epsilon": 1e-6, - "opt_learning_rate_training_steps": 8000, - "num_warmup_steps": 420, + "opt_learning_rate_training_steps": 2254, + "num_warmup_steps": 0, "start_warmup_step": 0, - "opt_lamb_beta_1": 0.91063, - "opt_lamb_beta_2": 0.96497, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.996, "opt_lamb_weight_decay_rate": 0.01 }, "Epochs 
to converge": [ - 3979008, 3598848, 3598848, 3776256, 3168000, 3370752, 3598848, 3472128, 3826944, 3472128, - 3066624, 3345408, 3269376, 3776256, 3396096, 3852288, 3294720, 4004352, 3396096, 3091968] + 2836240, 2801664, 2801664, 2727936, 2801664, 2875392, 2899968, 2727936, 2777088, 2875392, + 2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120] }, "bert_ref_3072": @@ -62,18 +62,18 @@ "Benchmark": "bert", "BS": 3072, "Hyperparams": { - "opt_base_learning_rate": 0.0015, + "opt_base_learning_rate": 0.002, "opt_epsilon": 1e-6, - "opt_learning_rate_training_steps": 1271, + "opt_learning_rate_training_steps": 1141, "num_warmup_steps": 100, "start_warmup_step": 0, - "opt_lamb_beta_1": 0.9, - "opt_lamb_beta_2": 0.999, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.998, "opt_lamb_weight_decay_rate": 0.01 }, "Epochs to converge": [ - 3465216, 3563520, 3489792, 3416064, 3489792, 3514368, 3760128, 3489792, 3612672, 3465216, - 3317760, 3661824, 3268608, 3563520, 3588096, 3366912, 3538944, 3489792, 3489792, 3710976] + 2703360, 2482176, 3072000, 2654208, 2580480, 2727936, 2605056, 2801664, 2777088, 2580480, + 2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816] }, "bert_ref_8192": diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json b/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json index 32f97bc6..a42b3df8 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json @@ -50,11 +50,11 @@ "epsilon": 0, "opt_learning_rate_warmup_epochs": 5, "opt_momentum": 0.9, - "opt_weight_decay": 2e-3, - "opt_learning_rate_decay_steps": 6720 + "opt_weight_decay": 2e-4, + "opt_learning_rate_decay_steps": 6095 }, "Epochs to converge": [ - 41, 40, 42, 42, 41, 41, 42, 42, 41, 41] + 42, 44, 43, 41, 41, 41, 42, 42, 43, 41] }, "resnet_ref_32768": @@ -68,12 +68,31 @@ "opt_learning_rate_decay_poly_power": 2, "epsilon": 0, "opt_learning_rate_warmup_epochs": 16, - 
"opt_momentum": 2.5e-5, + "opt_momentum": 0.94, "opt_weight_decay": 2e-3, "opt_learning_rate_decay_steps": 58 }, "Epochs to converge": [ 56, 56, 55, 56, 56, 56, 56, 56, 57, 56] + }, + + "resnet_ref_65536": + { + "Benchmark": "resnet", + "BS": 65536, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 24.699, + "opt_end_learning_rate": 1e-4, + "opt_learning_rate_decay_poly_power": 2, + "epsilon": 0, + "opt_learning_rate_warmup_epochs": 31, + "opt_momentum": 0.951807, + "opt_weight_decay": 1e-4, + "opt_learning_rate_decay_steps": 1133 + }, + "Epochs to converge": [ + 83, 85, 84, 86, 85, 85, 83, 84, 85, 85] } } diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json b/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json index 1e8f320b..f4a0ed87 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json @@ -1,6 +1,6 @@ { - "unet3d_ref_2": + "unet3d_ref_2_fp32": { "Benchmark": "unet3d", "BS": 2, @@ -9,21 +9,18 @@ "opt_learning_rate_warmup_epochs": 200 }, "Epochs to converge": [ - 1980, 1940, 2800, 3020, 2920, 1820, 2300, 2200, 2400, 1780, - 2840, 3880, 2120, 2860, 1920, 1480, 2380, 2360, 2220, 3920, - 2640, 2240, 2100, 2740, 1740, 3360, 2000, 2460, 2460, 2680, - 2320, 2000, 2040, 2180, 2540, 1400, 1720, 1860, 2940, 1880, - 1980, 2020, 2440, 2020, 2780, 1660, 2320, 2380, 2680, 2000, - 3140, 1680, 1660, 2560, 2660, 1560, 2100, 2000, 2300, 2240, - 1780, 2460, 2240, 3500, 1520, 3360, 2260, 2280, 2440, 2800, - 2380, 2020, 2880, 2720, 3960, 3840, 3220, 1300, 3140, 3160, - 3820, 3220, 2640, 3220, 3680, 2860, 3740, 2320, 2260, 3660, - 2260, 2560, 1760, 2720, 1940, 2640, 2200, 2500, 2640, 3460, - 1660, 2480, 1560, 2720, 2840, 2300, 1740, 3720, 2800, 3940, - 3460, 3380, 3580, 2360, 2720, 3320, 2360, 2980, 3000, 3800, - 2100, 1720, 2700, 1780, 3260, 2680, 2140, 3680, 2700] + 3420, 3420, 1440, 2320, 2940, 2240, 2600, 2840, 3320, 2360, + 4040, 2920, 3360, 2080, 3060, 2900, 4000, 3120, 2120, 2540, + 
1880, 2640, 2660, 2160, 1420, 2880, 2360, 2260, 2900, 2640, + 2380, 3060, 1880, 2420, 2560, 2580, 2180, 2960, 2480, 2140, + 3500, 2420, 2500, 3860, 1620, 2260, 2160, 1280, 2320, 2140, + 2580, 3020, 2480, 3300, 2140, 3400, 2940, 2520, 3680, 3380, + 3080, 2660, 2980, 2740, 2140, 2140, 3000, 2820, 2960, 2420, + 2760, 2940, 3280, 2660, 2200, 1660, 1520, 2320, 2180, 2280, + 2960, 2140, 3280, 2980, 3580, 3280, 3420] }, - "unet3d_ref_32": + + "unet3d_ref_32_amp": { "Benchmark": "unet3d", "BS": 32, @@ -32,13 +29,15 @@ "opt_learning_rate_warmup_epochs": 1000 }, "Epochs to converge": [ - 2220, 1960, 3200, 2440, 2000, 2060, 2420, 2160, 2480, 2480, - 3460, 2280, 1660, 2500, 3040, 1860, 2020, 2100, 2560, 3660, - 2100, 1760, 2720, 1360, 1580, 4680, 1860, 1680, 1740, 2120, - 1720, 2140, 1740, 2220, 1900, 1680, 3040, 1820, 2420, 1380, - 2020, 2420, 2020, 2660, 3680, 1740, 2600, 2720, 1940, 2420, - 2160, 2060, 2620, 2500, 2080, 3040, 1820, 2780, 1780, 1880, - 2240, 2460, 1860] + 1512, 3492, 1422, 2052, 2610, 1908, 2052, 1692, 1674, 2196, + 2682, 2412, 1980, 2556, 2466, 2358, 2880, 1638, 1890, 2178, + 1764, 1872, 2070, 2322, 2178, 2070, 2916, 1548, 1998, 2214, + 2034, 2322, 1602, 2610, 1908, 1944, 2646, 2250, 2268, 1854, + 1206, 2610, 2394, 2214, 1710, 3240, 2070, 1278, 2034, 1314, + 2376, 1530, 1656, 1674, 1494, 2160, 2862, 1152, 1440, 1926, + 1440, 2250, 2358, 1836, 2178, 1818, 1458, 1188, 2358, 1692, + 1962, 2412, 1296, 2232, 2196, 1926, 1260, 2070, 3042, 2106, + 2088, 1926, 2430, 1764, 1854, 2430, 2214, 1638, 2790] } } diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index ac62cea6..c774f64b 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,7 +60,7 @@ def get_submission_epochs(result_files, benchmark): if conv_result == "success": subm_epochs.append(conv_epoch) else: - subm_epochs.append(-1) + subm_epochs.append(1e9) not_converged = not_converged + 1 if (not_converged > 
1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'): subm_epochs = None