Merge pull request #185 from xyhuang/1.1-branch
Logging lib -- checker logging improvements (#180)
xyhuang authored Oct 7, 2021
2 parents 695cd76 + 4c57bc2 commit b38926a
Showing 7 changed files with 136 additions and 79 deletions.
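In short: the compliance, package, and RCP checker entry points now send their output through Python's logging module to a log file and the console at once, with the destination controlled by a new --log_output argument, and the package checker logs a final success/failure verdict instead of raising an exception.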
18 changes: 11 additions & 7 deletions mlperf_logging/compliance_checker/__main__.py
@@ -1,11 +1,17 @@
import sys
import logging

from . import mlp_compliance


parser = mlp_compliance.get_parser()
args = parser.parse_args()

logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
formatter = logging.Formatter("%(levelname)s - %(message)s")
logging.getLogger().handlers[0].setFormatter(formatter)
logging.getLogger().handlers[1].setFormatter(formatter)

config_file = args.config or f'{args.usage}_{args.ruleset}/common.yaml'

checker = mlp_compliance.make_checker(
@@ -17,12 +23,10 @@

valid, system_id, benchmark, result = mlp_compliance.main(args.filename, config_file, checker)

print(valid)
print(system_id)
print(benchmark)
print(result)

if not valid:
logging.error('FAILED')
print('** Logging output also at', args.log_output)
sys.exit(1)
else:
print('SUCCESS')
print('** Logging output also at', args.log_output)
logging.info('SUCCESS')
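
The five lines of logging setup added above are the bootstrap pattern this commit repeats in each entry point: basicConfig installs a FileHandler on the root logger, an extra StreamHandler mirrors records to the console, and one formatter is applied to both. A minimal standalone sketch of the pattern (the function name and log path are illustrative, not part of the package):

import logging

def setup_dual_logging(log_path):
    # Root logger writes to the log file (the encoding= argument needs Python 3.9+).
    logging.basicConfig(filename=log_path, encoding='utf-8', level=logging.INFO)
    # Mirror every record to the console as well.
    logging.getLogger().addHandler(logging.StreamHandler())
    # Apply the same format to the file handler and the console handler.
    formatter = logging.Formatter("%(levelname)s - %(message)s")
    for handler in logging.getLogger().handlers:
        handler.setFormatter(formatter)

setup_dual_logging('compliance_checker.log')
logging.info('this line goes to the file and the console')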
19 changes: 11 additions & 8 deletions mlperf_logging/compliance_checker/mlp_compliance.py
@@ -8,6 +8,7 @@
import os
import yaml
import json
import logging
import re
import math

@@ -82,7 +83,7 @@ def log_messages(self):
*self.not_overwritable
])
if message:
print(message)
logging.warning(" %s", message)

def has_messages(self):
return self.not_overwritable or self.overwritable
@@ -121,7 +122,7 @@ def run_check_end(self, tests, state):
try:
if not eval(test.strip(), state):
if test.strip().split()[0] == "sorted(s['initialized_tensors'])":
self.put_warning(f" Warning: Failed weights initialization check (can be ignored for 1.1.0)", key='')
self.put_warning(f" Warning: Failed weights initialization check (can be ignored for 1.1.0)", key='weights_initialization')
else:
self.put_message(
f"failed test: {test}"
@@ -258,6 +259,7 @@ def check_loglines(self, loglines, config):
current_dir = os.path.dirname(os.path.abspath(__file__))
while len(enqueued_configs)>0:
current_config = enqueued_configs.pop(0)
logging.info (' Compliance checks: %s', current_config)
config_file = general_file = os.path.join(current_dir, current_config)

if not os.path.exists(config_file):
@@ -269,14 +271,14 @@

def check_file(self, filename, config_file):

logging.info('Running compliance on file: %s', filename)
loglines, errors = mlp_parser.parse_file(filename, ruleset=self.ruleset)

if len(errors) > 0:
print('Found parsing errors:')
logging.warning(' Found parsing errors:')
for line, error in errors:
print(line)
print(' ^^ ', error)
print()
logging.warning(' %s',line)
logging.warning(' ^^ %s', error)
self.put_message('Log lines had parsing errors.')

self.check_loglines(loglines, config_file)
@@ -311,10 +313,11 @@ def get_parser():
parser.add_argument('--config', type=str,
help='mlperf logging config, by default it loads {usage}_{ruleset}/common.yaml', default=None)
parser.add_argument('--werror', action='store_true',
help='Treas warnings as errors')
help='Treat warnings as errors')
parser.add_argument('--quiet', action='store_true',
help='Suppress warnings. Does nothing if --werror is set')

parser.add_argument('--log_output', type=str, default='compliance_checker.log',
help='where to store compliance checker output log')
return parser


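With the new --log_output option, a typical invocation of this checker (the result-file path is illustrative, and --usage/--ruleset are the parser options implied by args.usage and args.ruleset above) would look like:

python3 -m mlperf_logging.compliance_checker \
    --usage training --ruleset 1.1.0 \
    --log_output compliance_checker.log \
    path/to/result_0.txt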
97 changes: 58 additions & 39 deletions mlperf_logging/package_checker/package_checker.py
@@ -6,6 +6,7 @@
import argparse
import glob
import json
import logging
import os
import sys

@@ -29,7 +30,7 @@ def _get_sub_folders(folder):


def _print_divider_bar():
print('------------------------------')
logging.info('------------------------------')


def check_training_result_files(folder, usage, ruleset, quiet, werror,
Expand All @@ -38,7 +39,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
Args:
folder: The folder for a submission package.
ruleset: The ruleset such as 0.6.0, 0.7.0, or 1.0.0
ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, or 1.1.0
"""
allowed_benchmarks = get_allowed_benchmarks(usage, ruleset)
benchmark_file_counts = get_result_file_counts(usage)
@@ -75,7 +76,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,

# If it is not a recognized benchmark, skip further checks.
if benchmark not in allowed_benchmarks:
print('Skipping benchmark: {}'.format(benchmark))
logging.warning(' Skipping benchmark: %s', benchmark)
continue

# Find all result files for this benchmark.
@@ -89,33 +90,32 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
os.path.join(folder, 'benchmarks', benchmark))

_print_divider_bar()
print('System {}'.format(system))
print('Benchmark {}'.format(benchmark))
logging.info(' Running compliance checks in dir %s', benchmark_folder)
logging.info(' System %s', system)
logging.info(' Benchmark %s', benchmark)
_print_divider_bar()

if is_weak_scaling:
if len(result_files) < benchmark_file_counts[benchmark]:
print('Expected at least {} runs, but detected {} runs.'.format(
logging.error('Expected at least %d runs, but detected %d runs.',
benchmark_file_counts[benchmark],
len(result_files),
))
len(result_files))
too_many_errors = True
else:
# The number of result files must be an exact number.
# Print a comprehensive message if some files in results
# directory do not match naming convention (results_*.txt)
if len(result_files) != benchmark_file_counts[benchmark]:
print('Expected {} runs, but detected {} runs.'.format(
benchmark_file_counts[benchmark],
len(result_files),
))
logging.error('Incorrect number of files in dir, or wrong file names in directory %s, '
'found %d, expected %d',
benchmark_folder, len(result_files), benchmark_file_counts[benchmark])
too_many_errors = True
if len(all_files) > len(result_files):
print(all_files)
print('Detected {} total files in directory {}, but some do not conform '
'to naming convention, should you rename them to result_*.txt ?'.format(len(all_files), benchmark_folder))
logging.warning('Detected %d total files in directory %s, but some do not conform '
'to naming convention, should you rename them to result_*.txt ?',len(all_files), benchmark_folder)

if len(result_files) < len(all_files):
print('WARNING: Unknown files in results directory {}'.format(benchmark_folder))
logging.warning('Unknown files in result directory: %s', benchmark_folder)

errors_found = 0
result_files.sort()
@@ -126,7 +126,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,

# For each result file, run the benchmark's compliance checks.
_print_divider_bar()
print('Run {}'.format(run))
logging.info('Run %d/%d', result_files.index(result_file) + 1, len(result_files))
config_file = '{usage}_{ruleset}/common.yaml'.format(
usage=usage,
ruleset=ruleset,
@@ -146,36 +146,39 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
if not valid:
errors_found += 1
if errors_found == 1 and benchmark != 'unet3d':
print('WARNING: One file does not comply.')
print('WARNING: Allowing this failure under olympic scoring '
'rules.')
logging.warning(" 1 file does not comply, accepting this under olympic scoring")
elif errors_found > 0 and errors_found <= 4 and benchmark == 'unet3d':
print('WARNING: {errors} file does not comply.'.format(errors=errors_found))
print('WARNING: Allowing this failure for unet3d under olympic scoring '
'rules.')
logging.warning(" %d files do not comply, accepting this under olympic scoring", errors_found)
elif errors_found > 0:
too_many_errors = True
logging.error(" %d files do not comply, directory cannot be accepted", errors_found)

# Check if each run uses unique seeds.
if ruleset in {'1.0.0', '1.1.0'} and division == 'closed':
if not seed_checker.check_seeds(result_files, source_files):
too_many_errors = True
logging.error('Seed checker failed')

# Run RCP checker for 1.0.0
# Run RCP checker for >= 1.0.0
if ruleset in {'1.0.0', '1.1.0'} and division == 'closed' and benchmark != 'minigo':
rcp_chk = rcp_checker.make_checker(usage, ruleset, verbose=False, bert_train_samples=rcp_bert_train_samples)
rcp_chk._compute_rcp_stats()

# Now go again through result files to do RCP checks
rcp_pass, rcp_msg = rcp_chk._check_directory(benchmark_folder, rcp_bypass)
if not rcp_pass:
print('ERROR: RCP Test Failed: {}.'.format(rcp_msg))
logging.error('RCP Test Failed: %s', rcp_msg)
too_many_errors = True
else:
logging.info('RCP Test Passed: %s', rcp_msg)

_print_divider_bar()

_print_divider_bar()
if too_many_errors:
raise Exception(
'Found too many errors in logging, see log above for details.')
logging.info('PACKAGE CHECKER FOUND ERRORS, LOOK INTO ERROR LOG LINES AND FIX THEM.')
else:
logging.info('PACKAGE CHECKER FOUND NO ERRORS, SUCCESS !')


def check_systems(folder, usage, ruleset):
Expand All @@ -184,34 +187,37 @@ def check_systems(folder, usage, ruleset):
Args:
folder: The folder for a submission package.
usage: The usage such as training, inference_edge, inference_server, hpc.
ruleset: The ruleset such as 0.6.0, 0.7.0, or 1.0.0.
ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, or 1.1.0.
"""
system_folder = os.path.join(folder,'systems')
pattern = '{folder}/*.json'.format(folder=system_folder)
json_files = glob.glob(pattern)
too_many_errors = False

too_many_errors = False
for json_file in json_files:
valid, _, _, _ = system_desc_checker.check_system_desc(json_file, usage, ruleset)
if not valid:
too_many_errors = True

if too_many_errors:
raise Exception(
'Found too many errors in system checking, see log above for details.')

return not too_many_errors

def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rcp_bert_train_samples):
def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rcp_bert_train_samples, log_output):
"""Checks a training package for compliance.
Args:
folder: The folder for a submission package.
usage: The usage such as training or hpc
ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0 or 1.0.0.
ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0 or 1.1.0.
"""
if ruleset in {'1.0.0', '1.1.0'}:
logging.info(' Checking System Description Files')
if not check_systems(folder, usage, ruleset):
logging.error('System description file checker failed')

check_training_result_files(folder, usage, ruleset, quiet, werror, rcp_bypass, rcp_bert_train_samples)
if ruleset == '1.0.0':
check_systems(folder, usage, ruleset)
_print_divider_bar()
print('\n** Detailed log output is also at', log_output)


def get_parser():
parser = argparse.ArgumentParser(
@@ -234,7 +240,7 @@
'ruleset',
type=str,
choices=rule_choices(),
help='the ruleset such as 0.6.0, 0.7.0, or 1.0.0'
help='the ruleset such as 0.6.0, 0.7.0, 1.0.0, or 1.1.0'
)
parser.add_argument(
'--werror',
@@ -258,14 +264,27 @@
'bert benchmark is taken from train_samples, '
'instead of epoch_num',
)
parser.add_argument(
'--log_output',
type=str,
default='package_checker.log',
help='where to store package checker output log'
)
return parser


def main():
parser = get_parser()
args = parser.parse_args()

check_training_package(args.folder, args.usage, args.ruleset, args.quiet, args.werror, args.rcp_bypass, args.rcp_bert_train_samples)
logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
formatter = logging.Formatter("%(levelname)s - %(message)s")
logging.getLogger().handlers[0].setFormatter(formatter)
logging.getLogger().handlers[1].setFormatter(formatter)

check_training_package(args.folder, args.usage, args.ruleset, args.quiet, args.werror,
args.rcp_bypass, args.rcp_bert_train_samples, args.log_output)


if __name__ == '__main__':
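Assuming the usual module entry point with positional folder, usage, and ruleset arguments (the submission path is illustrative), the new log destination is selected like this:

python3 -m mlperf_logging.package_checker path/to/submission training 1.1.0 \
    --log_output package_checker.log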
12 changes: 10 additions & 2 deletions mlperf_logging/package_checker/seed_checker.py
@@ -1,5 +1,6 @@
import warnings
import os
import logging

from ..compliance_checker import mlp_parser

@@ -10,6 +11,10 @@
}


def _print_divider_bar():
logging.info('------------------------------')


def is_source_file(path):
""" Check if a file is considered as a "source file" by extensions.
@@ -137,12 +142,15 @@ def check_seeds(self, result_files, source_files):
this benchmark.
"""
_print_divider_bar()
logging.info(" Running Seed Checker")
no_logged_seed, error_messages = self._assert_unique_seed_per_run(
result_files)

if len(error_messages) > 0:
print("Seed checker failed and found the following "
"errors:\n{}".format('\n'.join(error_messages)))
logging.error(" Seed checker failed and found the following errors %s: ", join(error_messages))
#print("Seed checker failed and found the following "
# "errors:\n{}".format('\n'.join(error_messages)))
return False

if no_logged_seed:
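For context, the property _assert_unique_seed_per_run enforces is that no seed repeats across the runs of a benchmark. A sketch of that uniqueness check (a hypothetical helper, not the module's API):

from collections import Counter

def duplicate_seeds(seeds):
    # Seeds appearing in more than one run; an empty result means every run was unique.
    return [s for s, n in Counter(seeds).items() if n > 1]

assert duplicate_seeds([1, 2, 3]) == []
assert duplicate_seeds([7, 7, 3]) == [7]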
14 changes: 12 additions & 2 deletions mlperf_logging/rcp_checker/__main__.py
@@ -1,17 +1,27 @@
import sys
import logging

from . import rcp_checker

parser = rcp_checker.get_parser()
args = parser.parse_args()

logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
formatter = logging.Formatter("%(levelname)s - %(message)s")
logging.getLogger().handlers[0].setFormatter(formatter)
logging.getLogger().handlers[1].setFormatter(formatter)

# Results summarizer makes these 3 calls to invoke RCP test
checker = rcp_checker.make_checker(args.rcp_usage, args.rcp_version, args.verbose, args.bert_train_samples)
checker._compute_rcp_stats()
test, msg = checker._check_directory(args.dir)

if test:
print(msg, ",RCP test passed")
logging.info('%s, RCP test PASSED', msg)
print('** Logging output also at', args.log_output)
else:
print(msg, ",RCP test failed")
logging.error('%s, RCP test FAILED, consider adding --rcp_bypass when running the package_checker if the RCP is NOT missing', msg)
print('** Logging output also at', args.log_output)
sys.exit(1)
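
As the comment in this file notes, the results summarizer drives the RCP test with the same three calls used here and in package_checker.py above; a sketch with illustrative arguments (the import path mirrors the package layout):

from mlperf_logging.rcp_checker import rcp_checker

checker = rcp_checker.make_checker('training', '1.1.0', verbose=False,
                                   bert_train_samples=False)
checker._compute_rcp_stats()
rcp_pass, rcp_msg = checker._check_directory('path/to/benchmark_folder')
if not rcp_pass:
    print('RCP test failed:', rcp_msg)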
