Skip to content

Commit

Permalink
Merge pull request #183 from xyhuang/1.1-branch
Browse files Browse the repository at this point in the history
1.1 branch
  • Loading branch information
xyhuang authored Oct 7, 2021
2 parents 05037fd + 98c9779 commit 59ec294
Show file tree
Hide file tree
Showing 26 changed files with 286 additions and 174 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ pip uninstall mlperf-logging
## Instructions

A submission needs to pass the package checker and run the result summarizer.
For submission 1.0 you can do that with
For submission 1.1 (latest) you can do that with the command below. For previous versions, use the respective verify script.

```sh
./verify_for_v1.0_training.sh <submission_directory>
./verify_for_v1.1_training.sh <submission_directory>
```

If you want to run the individual utilities/checker, please check the README files in the respective subdirectories.
11 changes: 11 additions & 0 deletions mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@
'rnnt',
'unet3d',
],

'1.1': [
'bert',
'dlrm',
'maskrcnn',
'minigo',
'resnet',
'ssd',
'rnnt',
'unet3d',
],
},

'hpc': {
Expand Down
10 changes: 10 additions & 0 deletions mlperf_logging/compliance_checker/hpc_1.0.0/closed_common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,13 @@
- KEY:
NAME: gradient_accumulation_frequency
CHECK: " v['value'] > 0 "

- KEY:
NAME: number_of_nodes
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: accelerators_per_node
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0"
10 changes: 0 additions & 10 deletions mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: number_of_nodes
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: accelerators_per_node
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: batchnorm_group_size
REQ: EXACTLY_ONE
Expand Down
5 changes: 5 additions & 0 deletions mlperf_logging/compliance_checker/hpc_1.0.0/open_common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
POST: " enqueue_config('hpc_1.0.0/open_{}.yaml'.format(v['value'])) "
19 changes: 12 additions & 7 deletions mlperf_logging/compliance_checker/mlp_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def put_warning(self, msg, key):
if self.werror:
self.put_message(msg, key)
elif not self.quiet:
print(key, msg)
self.warnings[key] = msg

def put_message(self, msg, key=None):
Expand Down Expand Up @@ -121,9 +120,12 @@ def run_check_end(self, tests, state):
for test in tests:
try:
if not eval(test.strip(), state):
self.put_message(
f"failed test: {test}"
f"\ncurrent context[s]={preety_dict(state['s'])}",
if test.strip().split()[0] == "sorted(s['initialized_tensors'])":
self.put_warning(f" Warning: Failed weights initialization check (can be ignored for 1.1.0)", key='')
else:
self.put_message(
f"failed test: {test}"
f"\ncurrent context[s]={preety_dict(state['s'])}",
)
except:
self.put_message(
Expand Down Expand Up @@ -225,8 +227,11 @@ def configured_checks(self, loglines, config_file):

if v['REQ']=='AT_LEAST_ONE':
if len(reported_values[k])<1:
self.put_message(f"Required AT_LEAST_ONE occurrence of '{k}' but found {len(reported_values[k])}",
key=k)
if k == 'weights_initialization':
self.put_warning(f" Warning: Failed weights initialization check (can be ignored for 1.1.0)", key=k)
else:
self.put_message(f"Required AT_LEAST_ONE occurrence of '{k}' but found {len(reported_values[k])}",
key=k)

if v['REQ'].startswith('AT_LEAST_ONE_OR'):
alternatives.add(tuple({k, *self.parse_alternatives(v['REQ'])}))
Expand Down Expand Up @@ -300,7 +305,7 @@ def get_parser():
parser.add_argument('--usage', type=str, default='training',
choices=usage_choices(),
help='what WG do the benchmarks come from')
parser.add_argument('--ruleset', type=str, default='1.0.0',
parser.add_argument('--ruleset', type=str, default='1.1.0',
choices=rule_choices(),
help='what version of rules to check the log against')
parser.add_argument('--config', type=str,
Expand Down
3 changes: 3 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .ruleset_060 import parse_file as parse_file_060
from .ruleset_070 import parse_file as parse_file_070
from .ruleset_100 import parse_file as parse_file_100
from .ruleset_100 import parse_file as parse_file_110


def parse_file(filename, ruleset='0.6.0'):
Expand All @@ -10,5 +11,7 @@ def parse_file(filename, ruleset='0.6.0'):
return parse_file_070(filename)
elif ruleset == '1.0.0':
return parse_file_100(filename)
elif ruleset == '1.1.0':
return parse_file_110(filename)
else:
raise Exception(f'Ruleset "{ruleset}" is not supported')
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
- KEY:
NAME: data_train_num_buckets
REQ: EXACTLY_ONE
CHECK: " v['value'] == 6 "
CHECK: " isinstance(v['value'], int) "

- KEY:
NAME: data_speed_perturbaton_min
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
NAME: submission_division
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['closed', 'open'] "
POST: " enqueue_config('training_1.0.0/{}_common.yaml'.format(v['value'])) "
POST: " enqueue_config('training_1.1.0/{}_common.yaml'.format(v['value'])) "

- KEY:
NAME: submission_status
Expand Down
7 changes: 3 additions & 4 deletions mlperf_logging/package_checker/package_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
any_pattern = '{folder}/*'.format(folder=benchmark_folder)
all_files = glob.glob(any_pattern, recursive=True)

print("LOOK:", benchmark, result_files)

# Find all source codes for this benchmark.
source_files = find_source_files_under(
os.path.join(folder, 'benchmarks', benchmark))
Expand Down Expand Up @@ -115,6 +113,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
print(all_files)
print('Detected {} total files in directory {}, but some do not conform '
'to naming convention, should you rename them to result_*.txt ?'.format(len(all_files), benchmark_folder))

if len(result_files) < len(all_files):
print('WARNING: Unknown files in results directory {}'.format(benchmark_folder))

Expand Down Expand Up @@ -158,12 +157,12 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
too_many_errors = True

# Check if each run use unique seeds.
if ruleset == '1.0.0' and division == 'closed':
if ruleset in {'1.0.0', '1.1.0'} and division == 'closed':
if not seed_checker.check_seeds(result_files, source_files):
too_many_errors = True

# Run RCP checker for 1.0.0 and 1.1.0
if ruleset == '1.0.0' and division == 'closed' and benchmark != 'minigo':
if ruleset in {'1.0.0', '1.1.0'} and division == 'closed' and benchmark != 'minigo':
rcp_chk = rcp_checker.make_checker(usage, ruleset, verbose=False, bert_train_samples=rcp_bert_train_samples)
rcp_chk._compute_rcp_stats()

Expand Down
28 changes: 22 additions & 6 deletions mlperf_logging/rcp_checker/hpc_1.0.0/rcps_cosmoflow.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,36 @@
"opt_weight_decay": 0.01
}
},
"cosmoflow_ref_128": {
"Benchmark": "cosmoflow",
"BS": 128,
"Epochs to converge": [ 16, 18, 18, 18, 18, 17, 18, 18, 18, 18, 18, 18, 17, 18, 18, 18, 19, 18, 18, 18 ],
"Hyperparams": {
"global_batch_size": 128,
"opt_name": "SGD",
"opt_base_learning_rate": 0.004,
"opt_learning_rate_warmup_epochs": 4,
"opt_learning_rate_warmup_factor": 4.0,
"opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
"opt_learning_rate_decay_factor": 0.25,
"dropout": 0.5,
"opt_weight_decay": 0.0
}
},
"cosmoflow_ref_1024": {
"Benchmark": "cosmoflow",
"BS": 1024,
"Epochs to converge": [ 34, 55, 45, 43, 51, 38, 39, 46, 45, 42, 40, 37, 48, 42, 45, 61, 34, 49, 44, 40, 46 ],
"Epochs to converge": [ 42, 38, 42, 40, 40, 39, 43, 37, 39, 37, 43, 39, 38, 42, 40, 42, 42, 38, 36, 43 ],
"Hyperparams": {
"global_batch_size": 1024,
"opt_name": "SGD",
"opt_base_learning_rate": 0.016,
"opt_learning_rate_warmup_epochs": 4,
"opt_learning_rate_warmup_factor": 16.0,
"opt_base_learning_rate": 0.012,
"opt_learning_rate_warmup_epochs": 0,
"opt_learning_rate_warmup_factor": 1.0,
"opt_learning_rate_decay_boundary_epochs": [ 32, 64 ],
"opt_learning_rate_decay_factor": 0.25,
"dropout": 0.0,
"opt_weight_decay": 0.01
"dropout": 0.5,
"opt_weight_decay": 0.0
}
}
}
5 changes: 0 additions & 5 deletions mlperf_logging/rcp_checker/hpc_1.0.0/rcps_deepcam.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
"batchnorm_groupsize": 1,
"global_batch_size": 128,
"gradient_accumulation_frequency": 1,
"num_workers": 32,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
Expand All @@ -31,7 +30,6 @@
"batchnorm_groupsize": 1,
"global_batch_size": 256,
"gradient_accumulation_frequency": 1,
"num_workers": 64,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
Expand All @@ -55,7 +53,6 @@
"batchnorm_groupsize": 1,
"global_batch_size": 512,
"gradient_accumulation_frequency": 1,
"num_workers": 128,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
Expand All @@ -79,7 +76,6 @@
"batchnorm_groupsize": 1,
"global_batch_size": 1024,
"gradient_accumulation_frequency": 1,
"num_workers": 256,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
Expand All @@ -103,7 +99,6 @@
"batchnorm_groupsize": 1,
"global_batch_size": 2048,
"gradient_accumulation_frequency": 1,
"num_workers": 1024,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
Expand Down
13 changes: 13 additions & 0 deletions mlperf_logging/rcp_checker/hpc_1.0.0/rcps_oc20.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,18 @@
"opt_learning_rate_decay_boundary_steps": [ 31264, 46896 ],
"opt_learning_rate_decay_factor": 0.1
}
},
"oc20_ref_2048": {
"Benchmark": "oc20",
"BS": 2048,
"Epochs to converge": [ 33, 32, 33, 33, 33, 34, 33, 33, 30, 33 ],
"Hyperparams": {
"global_batch_size": 2048,
"opt_base_learning_rate": 0.0016,
"opt_learning_rate_warmup_steps": 3908,
"opt_learning_rate_warmup_factor": 0.2,
"opt_learning_rate_decay_boundary_steps": [ 23448, 31264 ],
"opt_learning_rate_decay_factor": 0.1
}
}
}
6 changes: 3 additions & 3 deletions mlperf_logging/rcp_checker/rcp_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"hpc": {
'cosmoflow': 10,
'deepcam': 5,
#'oc20': 10,
'oc20': 5,
}
}

Expand Down Expand Up @@ -96,8 +96,8 @@ def get_submission_epochs(result_files, benchmark, bert_train_samples):
class RCP_Checker:

def __init__(self, usage, ruleset, verbose, bert_train_samples):
if ruleset != '1.0.0':
raise Exception('RCP Checker only supported in 1.0.0')
if ruleset not in {'1.0.0', "1.1.0"}:
raise Exception('RCP Checker only supported in 1.0.0 / 1.1.0')
self.usage = usage
self.ruleset = ruleset
self.alpha = 0.05
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
"bert_ref_256":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "Prior to 1.0 submission",
"Platform": "TPU-v4-16 / TF1, TF version ~2.4",
"BS": 256,
"Hyperparams": {
"opt_base_learning_rate": 0.00035,
Expand All @@ -23,6 +26,9 @@
"bert_ref_448":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "Prior to 1.0 submission",
"Platform": "TPU-v4-16 / TF1, TF version ~2.4",
"BS": 448,
"Hyperparams": {
"opt_base_learning_rate": 0.0004,
Expand All @@ -43,6 +49,9 @@
"bert_ref_1536":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "At 1.0 submission",
"Platform": "TPU-v4-128 / TF1, TF version ~2.4",
"BS": 1536,
"Hyperparams": {
"opt_base_learning_rate": 0.002,
Expand All @@ -63,6 +72,9 @@
"bert_ref_3072":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "Prior to 1.0 submission",
"Platform": "TPU-v4-128 / TF1, TF version ~2.4",
"BS": 3072,
"Hyperparams": {
"opt_base_learning_rate": 0.002,
Expand All @@ -83,6 +95,9 @@
"bert_ref_6144":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "At 1.0 submission",
"Platform": "TPU-v4-128 / TF1, TF version ~2.4",
"BS": 6144,
"Hyperparams": {
"opt_base_learning_rate": 0.0029293,
Expand All @@ -103,6 +118,9 @@
"bert_ref_6912":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "At 1.0 submission",
"Platform": "TPU-v4-128 / TF1, TF version ~2.4",
"BS": 6912,
"Hyperparams": {
"opt_base_learning_rate": 0.0029293,
Expand All @@ -123,6 +141,9 @@
"bert_ref_8192":
{
"Benchmark": "bert",
"Creator": "Google",
"When": "Prior to 1.0 submission",
"Platform": "TPU-v4-128 / TF1, TF version ~2.4",
"BS": 8192,
"Hyperparams": {
"opt_base_learning_rate": 0.00288293,
Expand All @@ -143,6 +164,9 @@
"bert_ref_12288":
{
"Benchmark": "bert",
"Creator": "NVIDIA",
"When": "At 1.0 submission",
"Platform": "TBD",
"BS": 12288,
"Hyperparams": {
"opt_base_learning_rate": 0.0033,
Expand All @@ -157,7 +181,7 @@
},
"Epochs to converge": [
4718592, 4816896, 4718592, 5210112, 4816896, 5406720, 5111808, 5210112, 5013504, 4816896,
5210112, 5013504, 4718592, 5111808, 4816896, 4915200, 5111808, 4718592, 4718592, 4620288]
5210112, 5013504, 4718592, 5111808, 4816896, 4915200, 5111808, 4718592, 4718592, 4620288]
}
}

Loading

0 comments on commit 59ec294

Please sign in to comment.