From e2531b777e1d70975d33cbccd11c43b38c6507a2 Mon Sep 17 00:00:00 2001
From: Marek Wawrzos <mwawrzos@nvidia.com>
Date: Tue, 11 May 2021 19:46:45 +0200
Subject: [PATCH 1/5] [RCP checker] fix batch size value for RNN-T RCPs (#121)

---
 mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json b/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json
index 81942532..6fef28f7 100644
--- a/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json
+++ b/mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json
@@ -3,7 +3,7 @@
   "rnn_t_ref_1k":
   {
     "Benchmark": "rnnt",
-    "BS": 128,
+    "BS": 1024,
     "Hyperparams": {
       "opt_base_learning_rate": 0.004,
       "opt_lamb_learning_rate_hold_epochs": 40,
@@ -26,7 +26,7 @@
   "rnn_t_ref_2k":
   {
     "Benchmark": "rnnt",
-    "BS": 256,
+    "BS": 2048,
     "Hyperparams": {
       "opt_base_learning_rate": 0.007,
       "opt_lamb_learning_rate_hold_epochs": 40,

From a51c1e39a159023774696da639a99b7d2090066a Mon Sep 17 00:00:00 2001
From: Matthew Frank <mfrank@nvidia.com>
Date: Tue, 11 May 2021 13:00:42 -0500
Subject: [PATCH 2/5] adding pack_submission script (#102)

---
 pack_submission.sh | 86 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100755 pack_submission.sh

diff --git a/pack_submission.sh b/pack_submission.sh
new file mode 100755
index 00000000..6508e676
--- /dev/null
+++ b/pack_submission.sh
@@ -0,0 +1,86 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+## Encrypting your project for submission
+
+# In MLPerf Training v1.0, a policy has been introduced to allow submitters
+# to submit an encrypted tarball of their submission repository along with the
+# decryption password and SHA1 hash of the encrypted tarball to the MLPerf
+# Training results chair.
+
+# To create an encrypted tarball and generate the SHA1 of the tarball, first
+# set the `SUBMITTER` environment variable to your company name (the script
+# prints a usage message and exits if it is unset). Then from the project root, run:
+
+# bash pack_submission.sh --pack
+
+# This command will prompt to enter and then confirm an encryption password.
+# After this command finishes running, there will be 2 files:
+
+# - `mlperf_submission_${SUBMITTER}.tar.gz` - The encrypted tarball, encrypted with AES256
+# - `mlperf_submission_${SUBMITTER}.sha1` - A text file containing the sha1 hash of the encrypted tarball
+
+# To test that the submission has been successfully packed, run:
+
+# bash path/to/pack_submission.sh --unpack
+
+# The 3 things that must be shared with the MLPerf Training results chair for
+# submission are:
+# 1. `mlperf_submission_${SUBMITTER}.tar.gz` - The encrypted tarball, encrypted with AES256
+# 2. `mlperf_submission_${SUBMITTER}.sha1` - A text file containing the sha1 hash of the encrypted tarball
+# 3. The decryption password
+
+# Before submission deadline, upload the tarball to a public cloud storage and
+# email the link along with items 2-3 to the MLCommons submissions address: submissions@mlcommons.org
+# Also, include the last two lines of the submission_checker_log.txt like
+# below in the body of the email as cursory evidence of a valid submission.
+
+# INFO:main:Results=265, NoResults=0
+# INFO:main:SUMMARY: submission looks OK
+
+print_error_and_exit() {  # show the usage summary, then abort with status 1
+    usage_line="usage: SUBMITTER=<COMPANY_NAME> ${0} --pack|--unpack|--list"
+    echo "${usage_line}"; exit 1
+}
+
+# SUBMITTER must be set in the environment: it names the tarball and sha1 file.
+[ -z "${SUBMITTER}" ] && print_error_and_exit
+
+TARBALL_NAME="mlperf_submission_${SUBMITTER}.tar.gz"
+SHA1_FILE_NAME="mlperf_submission_${SUBMITTER}.sha1"
+FOLDERS=$(find . -mindepth 1 -maxdepth 1 -type d -not -name '.*')  # expanded unquoted below: one word per dir
+
+if [ "$1" = "--pack" ]; then
+    echo "Packing folders ${FOLDERS} into tarball ${TARBALL_NAME} and encrypting"
+    tar --create --gzip --file - ${FOLDERS} | openssl enc -e -aes256 -out "${TARBALL_NAME}" || exit 1
+    echo "Generating sha1sum of tarball"
+    sha1sum "${TARBALL_NAME}" | tee "${SHA1_FILE_NAME}"
+elif [ "$1" = "--unpack" ]; then
+    echo "Checking sha1sum of tarball"
+    if [ "$(sha1sum "${TARBALL_NAME}")" = "$(cat "${SHA1_FILE_NAME}")" ]; then
+        echo "sha1sum matches."
+        openssl enc -d -aes256 -in "${TARBALL_NAME}" | tar --extract --gzip --file -
+    else
+        echo "ERROR: sha1sum of ${TARBALL_NAME} does not match contents of ${SHA1_FILE_NAME}" >&2; exit 1
+    fi
+elif [ "$1" = "--list" ]; then
+    if [ "$(sha1sum "${TARBALL_NAME}")" = "$(cat "${SHA1_FILE_NAME}")" ]; then
+        openssl enc -d -aes256 -in "${TARBALL_NAME}" | tar --list --gzip --file -
+    else
+        echo "ERROR: sha1sum of ${TARBALL_NAME} does not match contents of ${SHA1_FILE_NAME}" >&2; exit 1
+    fi
+else
+    print_error_and_exit
+fi
+

From 4a2517ed06a1d5962f1aeb3e0d1f61b5b1cffb85 Mon Sep 17 00:00:00 2001
From: itay hubara <itayhubara@users.noreply.github.com>
Date: Fri, 14 May 2021 05:24:39 +0300
Subject: [PATCH 3/5] adding some constants for BERT (#113)

Co-authored-by: Xinyuan Huang <xyhuang@users.noreply.github.com>
---
 mlperf_logging/mllog/constants.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py
index a6a86f8d..b2f2fec3 100644
--- a/mlperf_logging/mllog/constants.py
+++ b/mlperf_logging/mllog/constants.py
@@ -46,12 +46,14 @@
 TRANSFORMER = "transformer"
 RNNT = "rnnt"
 UNET3D = "unet3d"
+BERT ="bert"
 
 # Constant values - model info
 ADAM = "adam"
 LARS = "lars"
 LAZY_ADAM = "lazy_adam"
 SGD = "sgd"
+LAMB ="lamb"
 
 # Constant values - metadata info
 ABORTED = "aborted"
@@ -101,6 +103,7 @@
 MIN_IMAGE_SIZE = "min_image_size"
 MODEL_BN_SPAN = "model_bn_span"
 NUM_IMAGE_CANDIDATES = "num_image_candidates"
+NUM_WARMUP_STEPS = "num_warmup_steps"
 OPT_ADAM_BETA_1 = "opt_adam_beta_1"
 OPT_ADAM_BETA_2 = "opt_adam_beta_2"
 OPT_ADAM_EPSILON = "opt_adam_epsilon"
@@ -108,6 +111,7 @@
 OPT_BASE_LR = "opt_base_learning_rate"
 OPT_LAMB_LR_MIN = "opt_lamb_learning_rate_min"
 OPT_LAMB_LR_DECAY_POLY_POWER = "opt_lamb_learning_rate_decay_poly_power"
+OPT_LAMB_WEIGHT_DECAY = "opt_lamb_weight_decay_rate"
 OPT_LAMB_BETA_1 = "opt_lamb_beta_1"
 OPT_LAMB_BETA_2 = "opt_lamb_beta_2"
 OPT_LAMB_EPSILON = "opt_lamb_epsilon"
@@ -121,6 +125,7 @@
 OPT_LR_DECAY_START_STEP = "opt_learning_rate_decay_start_step"
 OPT_LR_DECAY_STEPS = "opt_learning_rate_decay_steps"
 OPT_LR_REMAIN_STEPS = "opt_learning_rate_remain_steps"
+OPT_LR_TRAINING_STEPS = "opt_learning_rate_training_steps"
 OPT_LR_WARMUP_EPOCHS = "opt_learning_rate_warmup_epochs"
 OPT_LR_WARMUP_FACTOR = "opt_learning_rate_warmup_factor"
 OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps"
@@ -140,6 +145,7 @@
 MODEL_EVAL_EMA_FACTOR = "model_eval_ema_factor"
 MODEL_WEIGHTS_INITIALIZATION_SCALE = "model_weights_initialization_scale"
 EVAL_MAX_PREDICTION_SYMBOLS = "eval_max_prediction_symbols"
+START_WARMUP_STEP ="start_warmup_step"
 
 # Log keys - misc.
 BBOX = "bbox"

From 7b1fb7df3224272a9e4b54d9afc2e28ef30e2bd1 Mon Sep 17 00:00:00 2001
From: Shang Wang <66387198+shangw-nvidia@users.noreply.github.com>
Date: Thu, 13 May 2021 22:25:02 -0400
Subject: [PATCH 4/5] [RCP][DLRM] Changing epoch number starting from 1. (#123)

---
 mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json b/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json
index 99165ad9..2a3dfb0a 100644
--- a/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json
+++ b/mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json
@@ -13,7 +13,8 @@
        "sgd_opt_learning_rate_decay_steps": 30000
     },
     "Epochs to converge": [
-       0.8, 0.75, 0.75, 0.75, 0.75, 0.8, 0.7, 0.75, 0.75, 0.75]
+       1.8, 1.75, 1.75, 1.75, 1.75, 1.8, 1.7, 1.75, 1.75, 1.75
+    ]
   },
 
   "dlrm_ref_55296":
@@ -29,9 +30,10 @@
        "sgd_opt_learning_rate_decay_steps": 27772
     },
     "Epochs to converge": [
-       0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95,
-       0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9,
-       0.9, 0.9, 0.85, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9]
+       1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.95,
+       1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9,
+       1.9, 1.9, 1.85, 1.9, 1.9, 1.9, 1.9, 1.95, 1.9 
+    ]
   }
 
 }

From ab442825763bcecc3dc83d0b4d49df2d62c6ca77 Mon Sep 17 00:00:00 2001
From: Elias Mizan <43765737+emizan76@users.noreply.github.com>
Date: Fri, 14 May 2021 19:10:00 -0700
Subject: [PATCH 5/5] RCP update: Resnet 64K, Unet3D, BERT (#119)

* First RCP checker commit, just a small commit with a README file to make sure the GitHub
forking flow works for me.

* Rcp_checker implementation:

* Added 1.0.0/rcps.json file. Still in progress as RCPs have not been
finalized

* Code is in rcp_checker.py. This currently contains a single
RCP_Checker class with functions to consume json file, construct the RCP
structure, compute means, stdevs, and min allowed speedups,
find RCPs based on benchmark and batch size, and generate interpolated
RCPs. This is all the processing needed to happen at startup.
No support yet to process and evaluate submission runs. This is TBD

* __main__.py run a couple of simple tests, this will be moved
eventually to a separate test file.

* Added a few more 1.0.0 RCPs (still in progress)

Added submission directory processing and comparison to RCPs in
rcp_checker.

Connected RCP checker to the result_summarizer.

Fixed a couple of bugs.

* Added remaining RCPs (resnet, bert, rnnt, unet3d), and fixed ones already in (maskrcnn, dlrm, ssd).

Made a few fixes suggested by Victor

* Update mlperf_logging/rcp_checker/README.md

Co-authored-by: Marek Wawrzos <marek.28.93@gmail.com>

* One step closer to v1.0.0

* System Description Checker:
  - Updated to 1.0.0

* Package Checker:
  - Added support for 1.0.0
  - Added calls to the RCP checker
  - Added call to the system description checker
  - Added support for the Unet3d olympic scoring (reject top and bottom
  4)

* Results Summarizer
  - Added support for 1.0.0
  - Refactored olympic scoring calculation to be able to accommodate
  unet3d (reject top and bottom 4)
  - Made a couple of fixes to RCP checker interface and disabled RCP
  checks for minigo.

* RCP Checker
  - Split monolithic RCP json file into 1 json file / benchmark. This
  improves readability and makes adding more RCPs easier
  - Added support for Unet3D RCP checking: Reject top and bottom 4
  scores instead of 1
  - Added verbose mode to assist submitters with debugging
  - Fixed a couple of bugs I found after previous PR was merged.

* Documentation:
  - Updated README files for RCP checker, results summarizer, package
  checker and system description checker

* Fixes suggested by Marek.

* Fixed a bug in the RCP checker

Updated max compile time rule from 20mins to 30mins.

Removed a print statement from the result_summarizer.

* Logging 1.0.0 fixes based on some testing and more knowledge on
submission procedure

1. Added --rcp_bypass command line flag in package checker.
   Submitter can use it to allow uploading of benchmarks that fail the
   RCP test. This is a package checker flag that is propagated to the
   RCP checker. It has no meaning using it on a standalone RCP checker
   run, as the package checker outputs controls whether a submission is
   valid.

2. Removed RCP checker from result_summarizer. It does not need to run
   there as it is called by the package checker.

3. Fixes for open submission: Do not call the seed checker, nor the RCP
   checker. Fixed a bug where open_common was including
   closed_<benchmark> rules. Since submitters in the open category can
   now use their own convergence rules I removed the convergence
   rules used in v0.7. So now the only rules for open submissions are
   the number of runs and open_common compliance rules.

* Forgot to add verifier 1.0 top-level script in my previous commit.

* Fixed failures pointed out by Shang:
- Line can start with :::MLLOG but it is legal to have anything else
before :::MLLOG
- Opened log files as latin-1, just like the compliance checker

* Added Resnet temporary RCP for B=64K. The RCP was derived by Google's
0.7 tpu-v3-8192-TF submission and the 5 runs were duplicated

Updated Unet3D RCPs.

* Fixed RCP checker bug: When there were non-converging runs, the mean
epochs to converge for the submission was under-reported.

Updated compliance README file to 1.0.0

* Added final Resnet 64K RCP and updated 8K RCP.

* Updated RCPs for Bert: Removed 768, added 1536 and updated 3072.

Co-authored-by: Marek Wawrzos <marek.28.93@gmail.com>
---
 mlperf_logging/compliance_checker/README.md   | 28 ++++++------
 mlperf_logging/compliance_checker/__main__.py |  2 +
 .../rcp_checker/1.0.0/rcps_bert.json          | 30 ++++++-------
 .../rcp_checker/1.0.0/rcps_resnet.json        | 27 ++++++++++--
 .../rcp_checker/1.0.0/rcps_unet3d.json        | 43 +++++++++----------
 mlperf_logging/rcp_checker/rcp_checker.py     |  2 +-
 6 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md
index c75b43e7..fe15615d 100644
--- a/mlperf_logging/compliance_checker/README.md
+++ b/mlperf_logging/compliance_checker/README.md
@@ -10,8 +10,9 @@ To check a log file for compliance:
 
     python -m mlperf_logging.compliance_checker [--config YAML] [--ruleset MLPERF_EDITION] FILENAME
 
-By default, 0.7.0 edition rules are used and the default config is set to `0.7.0/common.yaml`.
+By default, 1.0.0 edition rules are used and the default config is set to `1.0.0/common.yaml`.
 This config will check all common keys and enqueue benchmark specific config to be checked as well.
+Older editions that are still supported: 0.7.0 and 0.6.0.
 
 Prints `SUCCESS` when no issues were found. Otherwise will print error details.
 
@@ -19,20 +20,20 @@ As log examples use [NVIDIA's v0.6 training logs](https://github.com/mlperf/trai
 
 ### Existing config files
 
-    0.7.0/common.yaml        - currently the default config file, checks common fields complience and equeues benchmark-specific config file
-    0.7.0/resnet.yaml
-    0.7.0/ssd.yaml
-    0.7.0/minigo.yaml
-    0.7.0/maskrcnn.yaml
-    0.7.0/gnmt.yaml
-    0.7.0/transformer.yaml
-    0.7.0/bert.yaml
-    0.7.0/dlrm.yaml
+    1.0.0/common.yaml        - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
+    1.0.0/resnet.yaml
+    1.0.0/ssd.yaml
+    1.0.0/minigo.yaml
+    1.0.0/maskrcnn.yaml
+    1.0.0/rnnt.yaml
+    1.0.0/unet3d.yaml
+    1.0.0/bert.yaml
+    1.0.0/dlrm.yaml
 
 ### Implementation details
 Compliance checking is done following below algorithm.
 
-1. Parser converts the log into a list of records, each record corresponds to MLL 
+1. Parser converts the log into a list of records, each record corresponds to MLLOG
    line and contains all relevant extracted information
 2. Set of rules to be checked in loaded from provided config yaml file
 3. Process optional `BEGIN` rule if present by executing provided `CODE` section
@@ -114,7 +115,7 @@ Example:
 `ll` is a structure representing current log line that triggered `KEY` record. `ll` has the following fields
 that can be accessed:
 - `full_string` - the complete line as a string
-- `timestamp` - seconds as a float, e.g. 1234.567
+- `timestamp` - milliseconds as an integer
 - `key` - the string key
 - `value` - the parsed value associated with the key, or None if no value
 - `lineno` - line number in the original file of the current key
@@ -143,7 +144,7 @@ Example:
         NAME:  submission_benchmark
         REQ:   EXACTLY_ONE
         CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt'] "
-        POST:  " enqueue_config('0.7.0/{}.yaml'.format(v['value'])) "
+        POST:  " enqueue_config('1.0.0/{}.yaml'.format(v['value'])) "
 
 
 #### Other operations
@@ -158,6 +159,7 @@ For instance, can define rules that would print out information as shown in the
 Tested and confirmed working using the following software versions:
 - Python 2.7.12 + PyYAML 3.11
 - Python 3.6.8  + PyYAML 5.1
+- Python 3.9.2 + PyYAML 5.3.1
 
 ### How to install PyYaML
 
diff --git a/mlperf_logging/compliance_checker/__main__.py b/mlperf_logging/compliance_checker/__main__.py
index 438bcd7e..1b8f808e 100644
--- a/mlperf_logging/compliance_checker/__main__.py
+++ b/mlperf_logging/compliance_checker/__main__.py
@@ -23,3 +23,5 @@
 
 if not valid:
     sys.exit(1)
+else:
+    print('SUCCESS')
diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json b/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json
index 7bbd3124..39b61ae7 100644
--- a/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json
+++ b/mlperf_logging/rcp_checker/1.0.0/rcps_bert.json
@@ -38,23 +38,23 @@
        2508800, 2458624, 2684416, 2533888, 2533888, 2784768, 2308096, 2784768, 2584064, 2809856]
   },
 
-  "bert_ref_768":
+  "bert_ref_1536":
   {
     "Benchmark": "bert",
-    "BS": 768,
+    "BS": 1536,
     "Hyperparams": {
-      "opt_base_learning_rate": 0.00035,
+      "opt_base_learning_rate": 0.002,
       "opt_epsilon": 1e-6,
-      "opt_learning_rate_training_steps": 8000,
-      "num_warmup_steps": 420,
+      "opt_learning_rate_training_steps": 2254,
+      "num_warmup_steps": 0,
       "start_warmup_step": 0,
-      "opt_lamb_beta_1": 0.91063,
-      "opt_lamb_beta_2": 0.96497,
+      "opt_lamb_beta_1": 0.66,
+      "opt_lamb_beta_2": 0.996,
       "opt_lamb_weight_decay_rate": 0.01
     },
     "Epochs to converge": [
-       3979008, 3598848, 3598848, 3776256, 3168000, 3370752, 3598848, 3472128, 3826944, 3472128,
-       3066624, 3345408, 3269376, 3776256, 3396096, 3852288, 3294720, 4004352, 3396096, 3091968]
+       2836240, 2801664, 2801664, 2727936, 2801664, 2875392, 2899968, 2727936, 2777088, 2875392,
+       2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120]
   },
 
   "bert_ref_3072":
@@ -62,18 +62,18 @@
     "Benchmark": "bert",
     "BS": 3072,
     "Hyperparams": {
-      "opt_base_learning_rate": 0.0015,
+      "opt_base_learning_rate": 0.002,
       "opt_epsilon": 1e-6,
-      "opt_learning_rate_training_steps": 1271,
+      "opt_learning_rate_training_steps": 1141,
       "num_warmup_steps": 100,
       "start_warmup_step": 0,
-      "opt_lamb_beta_1": 0.9,
-      "opt_lamb_beta_2": 0.999,
+      "opt_lamb_beta_1": 0.66,
+      "opt_lamb_beta_2": 0.998,
       "opt_lamb_weight_decay_rate": 0.01
     },
     "Epochs to converge": [
-       3465216, 3563520, 3489792, 3416064, 3489792, 3514368, 3760128, 3489792, 3612672, 3465216,
-       3317760, 3661824, 3268608, 3563520, 3588096, 3366912, 3538944, 3489792, 3489792, 3710976]
+       2703360, 2482176, 3072000, 2654208, 2580480, 2727936, 2605056, 2801664, 2777088, 2580480,
+       2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816]
   },
 
   "bert_ref_8192":
diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json b/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json
index 32f97bc6..a42b3df8 100644
--- a/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json
+++ b/mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json
@@ -50,11 +50,11 @@
       "epsilon": 0,
       "opt_learning_rate_warmup_epochs": 5,
       "opt_momentum": 0.9,
-      "opt_weight_decay": 2e-3,
-      "opt_learning_rate_decay_steps": 6720
+      "opt_weight_decay": 2e-4,
+      "opt_learning_rate_decay_steps": 6095
     },
     "Epochs to converge": [
-      41, 40, 42, 42, 41, 41, 42, 42, 41, 41]
+      42, 44, 43, 41, 41, 41, 42, 42, 43, 41]
   },
 
   "resnet_ref_32768":
@@ -68,12 +68,31 @@
       "opt_learning_rate_decay_poly_power": 2,
       "epsilon": 0,
       "opt_learning_rate_warmup_epochs": 16,
-      "opt_momentum": 2.5e-5,
+      "opt_momentum": 0.94,
       "opt_weight_decay": 2e-3,
       "opt_learning_rate_decay_steps": 58
     },
     "Epochs to converge": [
       56, 56, 55, 56, 56, 56, 56, 56, 57, 56]
+  },
+
+  "resnet_ref_65536":
+  {
+    "Benchmark": "resnet",
+    "BS": 65536,
+    "Hyperparams": {
+      "optimizer": "lars",
+      "opt_base_learning_rate": 24.699,
+      "opt_end_learning_rate": 1e-4,
+      "opt_learning_rate_decay_poly_power": 2,
+      "epsilon": 0,
+      "opt_learning_rate_warmup_epochs": 31,
+      "opt_momentum": 0.951807,
+      "opt_weight_decay": 1e-4,
+      "opt_learning_rate_decay_steps": 1133
+    },
+    "Epochs to converge": [
+      83, 85, 84, 86, 85, 85, 83, 84, 85, 85]
   }
 
 }
diff --git a/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json b/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json
index 1e8f320b..f4a0ed87 100644
--- a/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json
+++ b/mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json
@@ -1,6 +1,6 @@
 {
 
-  "unet3d_ref_2":
+  "unet3d_ref_2_fp32":
   {
     "Benchmark": "unet3d",
     "BS": 2,
@@ -9,21 +9,18 @@
       "opt_learning_rate_warmup_epochs": 200
     },
     "Epochs to converge": [
-      1980, 1940, 2800, 3020, 2920, 1820, 2300, 2200, 2400, 1780,
-      2840, 3880, 2120, 2860, 1920, 1480, 2380, 2360, 2220, 3920,
-      2640, 2240, 2100, 2740, 1740, 3360, 2000, 2460, 2460, 2680,
-      2320, 2000, 2040, 2180, 2540, 1400, 1720, 1860, 2940, 1880,
-      1980, 2020, 2440, 2020, 2780, 1660, 2320, 2380, 2680, 2000,
-      3140, 1680, 1660, 2560, 2660, 1560, 2100, 2000, 2300, 2240,
-      1780, 2460, 2240, 3500, 1520, 3360, 2260, 2280, 2440, 2800,
-      2380, 2020, 2880, 2720, 3960, 3840, 3220, 1300, 3140, 3160,
-      3820, 3220, 2640, 3220, 3680, 2860, 3740, 2320, 2260, 3660,
-      2260, 2560, 1760, 2720, 1940, 2640, 2200, 2500, 2640, 3460,
-      1660, 2480, 1560, 2720, 2840, 2300, 1740, 3720, 2800, 3940,
-      3460, 3380, 3580, 2360, 2720, 3320, 2360, 2980, 3000, 3800,
-      2100, 1720, 2700, 1780, 3260, 2680, 2140, 3680, 2700]
+      3420, 3420, 1440, 2320, 2940, 2240, 2600, 2840, 3320, 2360,
+      4040, 2920, 3360, 2080, 3060, 2900, 4000, 3120, 2120, 2540,
+      1880, 2640, 2660, 2160, 1420, 2880, 2360, 2260, 2900, 2640,
+      2380, 3060, 1880, 2420, 2560, 2580, 2180, 2960, 2480, 2140,
+      3500, 2420, 2500, 3860, 1620, 2260, 2160, 1280, 2320, 2140,
+      2580, 3020, 2480, 3300, 2140, 3400, 2940, 2520, 3680, 3380,
+      3080, 2660, 2980, 2740, 2140, 2140, 3000, 2820, 2960, 2420,
+      2760, 2940, 3280, 2660, 2200, 1660, 1520, 2320, 2180, 2280,
+      2960, 2140, 3280, 2980, 3580, 3280, 3420]
   },
-  "unet3d_ref_32":
+
+  "unet3d_ref_32_amp":
   {
     "Benchmark": "unet3d",
     "BS": 32,
@@ -32,13 +29,15 @@
       "opt_learning_rate_warmup_epochs": 1000
     },
     "Epochs to converge": [
-      2220, 1960, 3200, 2440, 2000, 2060, 2420, 2160, 2480, 2480,
-      3460, 2280, 1660, 2500, 3040, 1860, 2020, 2100, 2560, 3660,
-      2100, 1760, 2720, 1360, 1580, 4680, 1860, 1680, 1740, 2120,
-      1720, 2140, 1740, 2220, 1900, 1680, 3040, 1820, 2420, 1380,
-      2020, 2420, 2020, 2660, 3680, 1740, 2600, 2720, 1940, 2420,
-      2160, 2060, 2620, 2500, 2080, 3040, 1820, 2780, 1780, 1880,
-      2240, 2460, 1860]
+      1512, 3492, 1422, 2052, 2610, 1908, 2052, 1692, 1674, 2196,
+      2682, 2412, 1980, 2556, 2466, 2358, 2880, 1638, 1890, 2178,
+      1764, 1872, 2070, 2322, 2178, 2070, 2916, 1548, 1998, 2214,
+      2034, 2322, 1602, 2610, 1908, 1944, 2646, 2250, 2268, 1854,
+      1206, 2610, 2394, 2214, 1710, 3240, 2070, 1278, 2034, 1314,
+      2376, 1530, 1656, 1674, 1494, 2160, 2862, 1152, 1440, 1926,
+      1440, 2250, 2358, 1836, 2178, 1818, 1458, 1188, 2358, 1692,
+      1962, 2412, 1296, 2232, 2196, 1926, 1260, 2070, 3042, 2106,
+      2088, 1926, 2430, 1764, 1854, 2430, 2214, 1638, 2790]
   }
 
 }
diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
index ac62cea6..c774f64b 100644
--- a/mlperf_logging/rcp_checker/rcp_checker.py
+++ b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -60,7 +60,7 @@ def get_submission_epochs(result_files, benchmark):
                         if conv_result == "success":
                             subm_epochs.append(conv_epoch)
                         else:
-                            subm_epochs.append(-1)
+                            subm_epochs.append(1e9)
                             not_converged = not_converged + 1
     if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'):
         subm_epochs = None