diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index c75b43e7..fe15615d 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -10,8 +10,9 @@ To check a log file for compliance: python -m mlperf_logging.compliance_checker [--config YAML] [--ruleset MLPERF_EDITION] FILENAME -By default, 0.7.0 edition rules are used and the default config is set to `0.7.0/common.yaml`. +By default, 1.0.0 edition rules are used and the default config is set to `1.0.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. +Older editions still supported are 0.7.0 and 0.6.0 Prints `SUCCESS` when no issues were found. Otherwise will print error details. @@ -19,20 +20,20 @@ As log examples use [NVIDIA's v0.6 training logs](https://github.com/mlperf/trai ### Existing config files - 0.7.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file - 0.7.0/resnet.yaml - 0.7.0/ssd.yaml - 0.7.0/minigo.yaml - 0.7.0/maskrcnn.yaml - 0.7.0/gnmt.yaml - 0.7.0/transformer.yaml - 0.7.0/bert.yaml - 0.7.0/dlrm.yaml + 1.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file + 1.0.0/resnet.yaml + 1.0.0/ssd.yaml + 1.0.0/minigo.yaml + 1.0.0/maskrcnn.yaml + 1.0.0/rnnt.yaml + 1.0.0/unet3d.yaml + 1.0.0/bert.yaml + 1.0.0/dlrm.yaml ### Implementation details Compliance checking is done following below algorithm. -1. Parser converts the log into a list of records, each record corresponds to MLL +1. Parser converts the log into a list of records, each record corresponds to MLLOG line and contains all relevant extracted information 2. Set of rules to be checked in loaded from provided config yaml file 3. 
Process optional `BEGIN` rule if present by executing provided `CODE` section @@ -114,7 +115,7 @@ Example: `ll` is a structure representing current log line that triggered `KEY` record. `ll` has the following fields that can be accessed: - `full_string` - the complete line as a string -- `timestamp` - seconds as a float, e.g. 1234.567 +- `timestamp` - milliseconds as an integer - `key` - the string key - `value` - the parsed value associated with the key, or None if no value - `lineno` - line number in the original file of the current key @@ -143,7 +144,7 @@ Example: NAME: submission_benchmark REQ: EXACTLY_ONE CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt'] " - POST: " enqueue_config('0.7.0/{}.yaml'.format(v['value'])) " + POST: " enqueue_config('1.0.0/{}.yaml'.format(v['value'])) " #### Other operations @@ -158,6 +159,7 @@ For instance, can define rules that would print out information as shown in the Tested and confirmed working using the following software versions: - Python 2.7.12 + PyYAML 3.11 - Python 3.6.8 + PyYAML 5.1 +- Python 3.9.2 + PyYAML 5.3.1 ### How to install PyYaML diff --git a/mlperf_logging/compliance_checker/__main__.py b/mlperf_logging/compliance_checker/__main__.py index 438bcd7e..1b8f808e 100644 --- a/mlperf_logging/compliance_checker/__main__.py +++ b/mlperf_logging/compliance_checker/__main__.py @@ -23,3 +23,5 @@ if not valid: sys.exit(1) +else: + print('SUCCESS') diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index a6a86f8d..b2f2fec3 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -46,12 +46,14 @@ TRANSFORMER = "transformer" RNNT = "rnnt" UNET3D = "unet3d" +BERT ="bert" # Constant values - model info ADAM = "adam" LARS = "lars" LAZY_ADAM = "lazy_adam" SGD = "sgd" +LAMB ="lamb" # Constant values - metadata info ABORTED = "aborted" @@ -101,6 +103,7 @@ MIN_IMAGE_SIZE = "min_image_size" MODEL_BN_SPAN = "model_bn_span" NUM_IMAGE_CANDIDATES 
= "num_image_candidates" +NUM_WARMUP_STEPS = "num_warmup_steps" OPT_ADAM_BETA_1 = "opt_adam_beta_1" OPT_ADAM_BETA_2 = "opt_adam_beta_2" OPT_ADAM_EPSILON = "opt_adam_epsilon" @@ -108,6 +111,7 @@ OPT_BASE_LR = "opt_base_learning_rate" OPT_LAMB_LR_MIN = "opt_lamb_learning_rate_min" OPT_LAMB_LR_DECAY_POLY_POWER = "opt_lamb_learning_rate_decay_poly_power" +OPT_LAMB_WEIGHT_DECAY = "opt_lamb_weight_decay_rate" OPT_LAMB_BETA_1 = "opt_lamb_beta_1" OPT_LAMB_BETA_2 = "opt_lamb_beta_2" OPT_LAMB_EPSILON = "opt_lamb_epsilon" @@ -121,6 +125,7 @@ OPT_LR_DECAY_START_STEP = "opt_learning_rate_decay_start_step" OPT_LR_DECAY_STEPS = "opt_learning_rate_decay_steps" OPT_LR_REMAIN_STEPS = "opt_learning_rate_remain_steps" +OPT_LR_TRAINING_STEPS = "opt_learning_rate_training_steps" OPT_LR_WARMUP_EPOCHS = "opt_learning_rate_warmup_epochs" OPT_LR_WARMUP_FACTOR = "opt_learning_rate_warmup_factor" OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps" @@ -140,6 +145,7 @@ MODEL_EVAL_EMA_FACTOR = "model_eval_ema_factor" MODEL_WEIGHTS_INITIALIZATION_SCALE = "model_weights_initialization_scale" EVAL_MAX_PREDICTION_SYMBOLS = "eval_max_prediction_symbols" +START_WARMUP_STEP ="start_warmup_step" # Log keys - misc. 
BBOX = "bbox" diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json b/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json index 7bbd3124..39b61ae7 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json @@ -38,23 +38,23 @@ 2508800, 2458624, 2684416, 2533888, 2533888, 2784768, 2308096, 2784768, 2584064, 2809856] }, - "bert_ref_768": + "bert_ref_1536": { "Benchmark": "bert", - "BS": 768, + "BS": 1536, "Hyperparams": { - "opt_base_learning_rate": 0.00035, + "opt_base_learning_rate": 0.002, "opt_epsilon": 1e-6, - "opt_learning_rate_training_steps": 8000, - "num_warmup_steps": 420, + "opt_learning_rate_training_steps": 2254, + "num_warmup_steps": 0, "start_warmup_step": 0, - "opt_lamb_beta_1": 0.91063, - "opt_lamb_beta_2": 0.96497, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.996, "opt_lamb_weight_decay_rate": 0.01 }, "Epochs to converge": [ - 3979008, 3598848, 3598848, 3776256, 3168000, 3370752, 3598848, 3472128, 3826944, 3472128, - 3066624, 3345408, 3269376, 3776256, 3396096, 3852288, 3294720, 4004352, 3396096, 3091968] + 2836240, 2801664, 2801664, 2727936, 2801664, 2875392, 2899968, 2727936, 2777088, 2875392, + 2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120] }, "bert_ref_3072": @@ -62,18 +62,18 @@ "Benchmark": "bert", "BS": 3072, "Hyperparams": { - "opt_base_learning_rate": 0.0015, + "opt_base_learning_rate": 0.002, "opt_epsilon": 1e-6, - "opt_learning_rate_training_steps": 1271, + "opt_learning_rate_training_steps": 1141, "num_warmup_steps": 100, "start_warmup_step": 0, - "opt_lamb_beta_1": 0.9, - "opt_lamb_beta_2": 0.999, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.998, "opt_lamb_weight_decay_rate": 0.01 }, "Epochs to converge": [ - 3465216, 3563520, 3489792, 3416064, 3489792, 3514368, 3760128, 3489792, 3612672, 3465216, - 3317760, 3661824, 3268608, 3563520, 3588096, 3366912, 3538944, 3489792, 3489792, 3710976] + 2703360, 2482176, 3072000, 2654208, 
2580480, 2727936, 2605056, 2801664, 2777088, 2580480, + 2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816] }, "bert_ref_8192": diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json b/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json index 99165ad9..2a3dfb0a 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json @@ -13,7 +13,8 @@ "sgd_opt_learning_rate_decay_steps": 30000 }, "Epochs to converge": [ - 0.8, 0.75, 0.75, 0.75, 0.75, 0.8, 0.7, 0.75, 0.75, 0.75] + 1.8, 1.75, 1.75, 1.75, 1.75, 1.8, 1.7, 1.75, 1.75, 1.75 + ] }, "dlrm_ref_55296": @@ -29,9 +30,10 @@ "sgd_opt_learning_rate_decay_steps": 27772 }, "Epochs to converge": [ - 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95, - 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, - 0.9, 0.9, 0.85, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9] + 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.95, + 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, + 1.9, 1.9, 1.85, 1.9, 1.9, 1.9, 1.9, 1.95, 1.9 + ] } } diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json b/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json index 32f97bc6..a42b3df8 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json @@ -50,11 +50,11 @@ "epsilon": 0, "opt_learning_rate_warmup_epochs": 5, "opt_momentum": 0.9, - "opt_weight_decay": 2e-3, - "opt_learning_rate_decay_steps": 6720 + "opt_weight_decay": 2e-4, + "opt_learning_rate_decay_steps": 6095 }, "Epochs to converge": [ - 41, 40, 42, 42, 41, 41, 42, 42, 41, 41] + 42, 44, 43, 41, 41, 41, 42, 42, 43, 41] }, "resnet_ref_32768": @@ -68,12 +68,31 @@ "opt_learning_rate_decay_poly_power": 2, "epsilon": 0, "opt_learning_rate_warmup_epochs": 16, - "opt_momentum": 2.5e-5, + "opt_momentum": 0.94, "opt_weight_decay": 2e-3, "opt_learning_rate_decay_steps": 58 }, "Epochs to converge": [ 56, 56, 55, 56, 56, 56, 56, 56, 57, 56] + }, + + "resnet_ref_65536": + { 
+ "Benchmark": "resnet", + "BS": 65536, + "Hyperparams": { + "optimizer": "lars", + "opt_base_learning_rate": 24.699, + "opt_end_learning_rate": 1e-4, + "opt_learning_rate_decay_poly_power": 2, + "epsilon": 0, + "opt_learning_rate_warmup_epochs": 31, + "opt_momentum": 0.951807, + "opt_weight_decay": 1e-4, + "opt_learning_rate_decay_steps": 1133 + }, + "Epochs to converge": [ + 83, 85, 84, 86, 85, 85, 83, 84, 85, 85] } } diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json b/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json index 81942532..6fef28f7 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json @@ -3,7 +3,7 @@ "rnn_t_ref_1k": { "Benchmark": "rnnt", - "BS": 128, + "BS": 1024, "Hyperparams": { "opt_base_learning_rate": 0.004, "opt_lamb_learning_rate_hold_epochs": 40, @@ -26,7 +26,7 @@ "rnn_t_ref_2k": { "Benchmark": "rnnt", - "BS": 256, + "BS": 2048, "Hyperparams": { "opt_base_learning_rate": 0.007, "opt_lamb_learning_rate_hold_epochs": 40, diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json b/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json index 1e8f320b..f4a0ed87 100644 --- a/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json +++ b/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json @@ -1,6 +1,6 @@ { - "unet3d_ref_2": + "unet3d_ref_2_fp32": { "Benchmark": "unet3d", "BS": 2, @@ -9,21 +9,18 @@ "opt_learning_rate_warmup_epochs": 200 }, "Epochs to converge": [ - 1980, 1940, 2800, 3020, 2920, 1820, 2300, 2200, 2400, 1780, - 2840, 3880, 2120, 2860, 1920, 1480, 2380, 2360, 2220, 3920, - 2640, 2240, 2100, 2740, 1740, 3360, 2000, 2460, 2460, 2680, - 2320, 2000, 2040, 2180, 2540, 1400, 1720, 1860, 2940, 1880, - 1980, 2020, 2440, 2020, 2780, 1660, 2320, 2380, 2680, 2000, - 3140, 1680, 1660, 2560, 2660, 1560, 2100, 2000, 2300, 2240, - 1780, 2460, 2240, 3500, 1520, 3360, 2260, 2280, 2440, 2800, - 2380, 2020, 2880, 2720, 3960, 3840, 3220, 1300, 3140, 3160, - 3820, 3220, 2640, 3220, 3680, 2860, 
3740, 2320, 2260, 3660, - 2260, 2560, 1760, 2720, 1940, 2640, 2200, 2500, 2640, 3460, - 1660, 2480, 1560, 2720, 2840, 2300, 1740, 3720, 2800, 3940, - 3460, 3380, 3580, 2360, 2720, 3320, 2360, 2980, 3000, 3800, - 2100, 1720, 2700, 1780, 3260, 2680, 2140, 3680, 2700] + 3420, 3420, 1440, 2320, 2940, 2240, 2600, 2840, 3320, 2360, + 4040, 2920, 3360, 2080, 3060, 2900, 4000, 3120, 2120, 2540, + 1880, 2640, 2660, 2160, 1420, 2880, 2360, 2260, 2900, 2640, + 2380, 3060, 1880, 2420, 2560, 2580, 2180, 2960, 2480, 2140, + 3500, 2420, 2500, 3860, 1620, 2260, 2160, 1280, 2320, 2140, + 2580, 3020, 2480, 3300, 2140, 3400, 2940, 2520, 3680, 3380, + 3080, 2660, 2980, 2740, 2140, 2140, 3000, 2820, 2960, 2420, + 2760, 2940, 3280, 2660, 2200, 1660, 1520, 2320, 2180, 2280, + 2960, 2140, 3280, 2980, 3580, 3280, 3420] }, - "unet3d_ref_32": + + "unet3d_ref_32_amp": { "Benchmark": "unet3d", "BS": 32, @@ -32,13 +29,15 @@ "opt_learning_rate_warmup_epochs": 1000 }, "Epochs to converge": [ - 2220, 1960, 3200, 2440, 2000, 2060, 2420, 2160, 2480, 2480, - 3460, 2280, 1660, 2500, 3040, 1860, 2020, 2100, 2560, 3660, - 2100, 1760, 2720, 1360, 1580, 4680, 1860, 1680, 1740, 2120, - 1720, 2140, 1740, 2220, 1900, 1680, 3040, 1820, 2420, 1380, - 2020, 2420, 2020, 2660, 3680, 1740, 2600, 2720, 1940, 2420, - 2160, 2060, 2620, 2500, 2080, 3040, 1820, 2780, 1780, 1880, - 2240, 2460, 1860] + 1512, 3492, 1422, 2052, 2610, 1908, 2052, 1692, 1674, 2196, + 2682, 2412, 1980, 2556, 2466, 2358, 2880, 1638, 1890, 2178, + 1764, 1872, 2070, 2322, 2178, 2070, 2916, 1548, 1998, 2214, + 2034, 2322, 1602, 2610, 1908, 1944, 2646, 2250, 2268, 1854, + 1206, 2610, 2394, 2214, 1710, 3240, 2070, 1278, 2034, 1314, + 2376, 1530, 1656, 1674, 1494, 2160, 2862, 1152, 1440, 1926, + 1440, 2250, 2358, 1836, 2178, 1818, 1458, 1188, 2358, 1692, + 1962, 2412, 1296, 2232, 2196, 1926, 1260, 2070, 3042, 2106, + 2088, 1926, 2430, 1764, 1854, 2430, 2214, 1638, 2790] } } diff --git a/mlperf_logging/rcp_checker/rcp_checker.py 
b/mlperf_logging/rcp_checker/rcp_checker.py index ac62cea6..c774f64b 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,7 +60,7 @@ def get_submission_epochs(result_files, benchmark): if conv_result == "success": subm_epochs.append(conv_epoch) else: - subm_epochs.append(-1) + subm_epochs.append(1e9) not_converged = not_converged + 1 if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'): subm_epochs = None diff --git a/pack_submission.sh b/pack_submission.sh new file mode 100755 index 00000000..6508e676 --- /dev/null +++ b/pack_submission.sh @@ -0,0 +1,86 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## Encrypting your project for submission + +# In MLPerf Training v1.0, a policy has been introduced to allow submitters +# to submit an encrypted tarball of their submission repository along with the +# decryption password and SHA1 hash of the encrypted tarball to the MLPerf +# Training results chair. + +# To create an encrypted tarball and generate the SHA1 of the tarball, first +# set the `SUBMITTER` environment variable to your +# company name. Then from the project root, run: + +# bash pack_submission.sh --pack + +# This command will prompt to enter and then confirm an encryption password. 
+# After this command finishes running, there will be 2 files: + +# - `mlperf_submission_${SUBMITTER}.tar.gz` - The encrypted tarball, encrypted with AES256 +# - `mlperf_submission_${SUBMITTER}.sha1` - A text file containing the sha1 hash of the encrypted tarball + +# To test that the submission has been successfully packed, run: + +# bash path/to/pack_submission.sh --unpack + +# The 3 things that must be shared with the MLPerf Training results chair for +# submission are: +# 1. `mlperf_submission_${SUBMITTER}.tar.gz` - The encrypted tarball, encrypted with AES256 +# 2. `mlperf_submission_${SUBMITTER}.sha1` - A text file containing the sha1 hash of the encrypted tarball +# 3. The decryption password + +# Before submission deadline, upload the tarball to a public cloud storage and +# email the link along with items 2-3 to the MLCommons submissions address: submissions@mlcommons.org +# Also, include the last two lines of the submission_checker_log.txt like +# below in the body of the email as cursory evidence of a valid submission. + +# INFO:main:Results=265, NoResults=0 +# INFO:main:SUMMARY: submission looks OK + +function print_error_and_exit { + echo "usage: SUBMITTER= ${0} --pack|--unpack|--list" + exit 1 +} + +# make sure SUBMITTER is set +[ -z "${SUBMITTER}" ] && print_error_and_exit + +TARBALL_NAME=mlperf_submission_${SUBMITTER}.tar.gz +SHA1_FILE_NAME=mlperf_submission_${SUBMITTER}.sha1 +FOLDERS=$(find -mindepth 1 -maxdepth 1 -type d -not -name '.*') + +if [ "$1" = "--pack" ]; then + echo "Packing folders ${FOLDERS} into tarball ${TARBALL_NAME} and encrypting" + tar --create --gzip --file - ${FOLDERS} | openssl enc -e -aes256 -out ${TARBALL_NAME} + echo "Generating sha1sum of tarball" + sha1sum ${TARBALL_NAME} | tee ${SHA1_FILE_NAME} +elif [ "$1" = "--unpack" ]; then + echo "Checking sha1sum of tarball" + if [ "`sha1sum ${TARBALL_NAME}`" = "`cat ${SHA1_FILE_NAME}`" ]; then + echo "sha1sum matches." 
+ openssl enc -d -aes256 -in ${TARBALL_NAME} | tar --extract --gzip --file - + else + echo "ERROR: sha1sum of ${TARBALL_NAME} does not match contents of ${SHA1_FILE_NAME}" + fi +elif [ "$1" = "--list" ]; then + if [ "`sha1sum ${TARBALL_NAME}`" = "`cat ${SHA1_FILE_NAME}`" ]; then + openssl enc -d -aes256 -in ${TARBALL_NAME} | tar --list --gzip --file - + else + echo "ERROR: sha1sum of ${TARBALL_NAME} does not match contents of ${SHA1_FILE_NAME}" + fi +else + print_error_and_exit +fi +