From 4c57bc2aae30229b095cc24999e1d89404619bc8 Mon Sep 17 00:00:00 2001
From: Elias Mizan <43765737+emizan76@users.noreply.github.com>
Date: Wed, 6 Oct 2021 23:39:11 -0700
Subject: [PATCH] Logging lib -- checker logging improvements (#180)

- Use the logging library instead of print(...) to display progress,
  warnings, and errors
- Replace almost all print(...) statements in the package, compliance,
  and rcp checkers
- Log output goes to stdout and also to package_checker.log,
  compliance_checker.log, or rcp_checker.log, depending on which checker
  you run. The log files can be changed from the command line.
- Add additional info to the log output
- Remove the exceptions/assertions at the end of the checkers when
  errors are found
---
 mlperf_logging/compliance_checker/__main__.py | 18 ++--
 .../compliance_checker/mlp_compliance.py      | 19 ++--
 .../package_checker/package_checker.py        | 97 +++++++++++--------
 .../package_checker/seed_checker.py           | 12 ++-
 mlperf_logging/rcp_checker/__main__.py        | 14 ++-
 mlperf_logging/rcp_checker/rcp_checker.py     | 37 ++++---
 .../system_desc_checker.py                    | 18 ++--
 7 files changed, 136 insertions(+), 79 deletions(-)

diff --git a/mlperf_logging/compliance_checker/__main__.py b/mlperf_logging/compliance_checker/__main__.py
index d1471fe8..2fe9ed2e 100644
--- a/mlperf_logging/compliance_checker/__main__.py
+++ b/mlperf_logging/compliance_checker/__main__.py
@@ -1,11 +1,17 @@
 import sys
+import logging
 
 from . import mlp_compliance
 
-
 parser = mlp_compliance.get_parser()
 args = parser.parse_args()
 
+logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler())
+formatter = logging.Formatter("%(levelname)s - %(message)s")
+logging.getLogger().handlers[0].setFormatter(formatter)
+logging.getLogger().handlers[1].setFormatter(formatter)
+
 config_file = args.config or f'{args.usage}_{args.ruleset}/common.yaml'
 
 checker = mlp_compliance.make_checker(
@@ -17,12 +23,10 @@
 
 valid, system_id, benchmark, result = mlp_compliance.main(args.filename, config_file, checker)
 
-print(valid)
-print(system_id)
-print(benchmark)
-print(result)
-
 if not valid:
+    logging.error('FAILED')
+    print('** Logging output also at', args.log_output)
     sys.exit(1)
 else:
-    print('SUCCESS')
+    print('** Logging output also at', args.log_output)
+    logging.info('SUCCESS')
diff --git a/mlperf_logging/compliance_checker/mlp_compliance.py b/mlperf_logging/compliance_checker/mlp_compliance.py
index e9d07cf2..cd1a84dc 100644
--- a/mlperf_logging/compliance_checker/mlp_compliance.py
+++ b/mlperf_logging/compliance_checker/mlp_compliance.py
@@ -8,6 +8,7 @@
 import os
 import yaml
 import json
+import logging
 import re
 import math
 
@@ -82,7 +83,7 @@ def log_messages(self):
             *self.not_overwritable
         ])
         if message:
-            print(message)
+            logging.warning(" %s", message)
 
     def has_messages(self):
         return self.not_overwritable or self.overwritable
@@ -121,7 +122,7 @@ def run_check_end(self, tests, state):
             try:
                 if not eval(test.strip(), state):
                     if test.strip().split()[0] == "sorted(s['initialized_tensors'])":
-                        self.put_warning(f" Warning: Failed weights initialization check (can be ignored for 1.1.0)", key='')
+                        self.put_warning(f" Warning: Failed weights initialization check (can be ignored for 1.1.0)", key='weights_initialization')
                     else:
                         self.put_message(
                             f"failed test: {test}"
@@ -258,6 +259,7 @@ def check_loglines(self, loglines, config):
         current_dir = os.path.dirname(os.path.abspath(__file__))
         while len(enqueued_configs)>0:
             current_config = enqueued_configs.pop(0)
+            logging.info(' Compliance checks: %s', current_config)
             config_file = general_file = os.path.join(current_dir, current_config)
 
             if not os.path.exists(config_file):
@@ -269,14 +271,14 @@ def check_loglines(self, loglines, config):
 
 
     def check_file(self, filename, config_file):
+        logging.info('Running compliance on file: %s', filename)
         loglines, errors = mlp_parser.parse_file(filename, ruleset=self.ruleset)
 
         if len(errors) > 0:
-            print('Found parsing errors:')
+            logging.warning(' Found parsing errors:')
             for line, error in errors:
-                print(line)
-                print(' ^^ ', error)
-                print()
+                logging.warning(' %s',line)
+                logging.warning(' ^^ %s', error)
             self.put_message('Log lines had parsing errors.')
 
         self.check_loglines(loglines, config_file)
@@ -311,10 +313,11 @@ def get_parser():
     parser.add_argument('--config', type=str,
                         help='mlperf logging config, by default it loads {usage}_{ruleset}/common.yaml',
                         default=None)
     parser.add_argument('--werror', action='store_true',
-                        help='Treas warnings as errors')
+                        help='Treat warnings as errors')
     parser.add_argument('--quiet', action='store_true',
                         help='Suppress warnings. Does nothing if --werror is set')
-
+    parser.add_argument('--log_output', type=str, default='compliance_checker.log',
+                        help='where to store compliance checker output log')
     return parser
 
diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py
index 2553b617..9aff796a 100644
--- a/mlperf_logging/package_checker/package_checker.py
+++ b/mlperf_logging/package_checker/package_checker.py
@@ -6,6 +6,7 @@
 import argparse
 import glob
 import json
+import logging
 import os
 import sys
 
@@ -29,7 +30,7 @@ def _get_sub_folders(folder):
 
 
 def _print_divider_bar():
-    print('------------------------------')
+    logging.info('------------------------------')
 
 
 def check_training_result_files(folder, usage, ruleset, quiet, werror,
     """Checks all result files for compliance.
 
     Args:
         folder: The folder for a submission package.
-        ruleset: The ruleset such as 0.6.0, 0.7.0, or 1.0.0
+        ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, or 1.1.0
     """
     allowed_benchmarks = get_allowed_benchmarks(usage, ruleset)
     benchmark_file_counts = get_result_file_counts(usage)
@@ -75,7 +76,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
 
         # If it is not a recognized benchmark, skip further checks.
         if benchmark not in allowed_benchmarks:
-            print('Skipping benchmark: {}'.format(benchmark))
+            logging.warning(' Skipping benchmark: %s', benchmark)
             continue
 
         # Find all result files for this benchmark.
@@ -89,33 +90,32 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
             os.path.join(folder, 'benchmarks', benchmark))
 
         _print_divider_bar()
-        print('System {}'.format(system))
-        print('Benchmark {}'.format(benchmark))
+        logging.info(' Running compliance checks in dir %s', benchmark_folder)
+        logging.info(' System %s', system)
+        logging.info(' Benchmark %s', benchmark)
+        _print_divider_bar()
 
         if is_weak_scaling:
             if len(result_files) < benchmark_file_counts[benchmark]:
-                print('Expected at least {} runs, but detected {} runs.'.format(
+                logging.error('Expected at least %d runs, but detected %d runs.',
                     benchmark_file_counts[benchmark],
-                    len(result_files),
-                ))
+                    len(result_files))
                 too_many_errors = True
         else:
             # The number of result files must be an exact number.
             # Print a comprehensive message if some files in results
             # directory do not match naming convention (results_*.txt)
             if len(result_files) != benchmark_file_counts[benchmark]:
-                print('Expected {} runs, but detected {} runs.'.format(
-                    benchmark_file_counts[benchmark],
-                    len(result_files),
-                ))
+                logging.error('Incorrect number of files in dir, or wrong file names in directory %s, '
+                              'found %d, expected %d',
+                              benchmark_folder, len(result_files), benchmark_file_counts[benchmark])
                 too_many_errors = True
             if len(all_files) > len(result_files):
-                print(all_files)
-                print('Detected {} total files in directory {}, but some do not conform '
-                      'to naming convention, should you rename them to result_*.txt ?'.format(len(all_files), benchmark_folder))
+                logging.warning('Detected %d total files in directory %s, but some do not conform '
+                                'to naming convention, should you rename them to result_*.txt ?',len(all_files), benchmark_folder)
             if len(result_files) < len(all_files):
-                print('WARNING: Unknown files in results directory {}'.format(benchmark_folder))
+                logging.warning('Unknown files in result directory: %s', benchmark_folder)
 
         errors_found = 0
         result_files.sort()
@@ -126,7 +126,7 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
 
             # For each result file, run the benchmark's compliance checks.
             _print_divider_bar()
-            print('Run {}'.format(run))
+            logging.info('Run %d/%d', result_files.index(result_file) + 1, len(result_files))
             config_file = '{usage}_{ruleset}/common.yaml'.format(
                 usage=usage,
                 ruleset=ruleset,
@@ -146,22 +146,20 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror,
             if not valid:
                 errors_found += 1
         if errors_found == 1 and benchmark != 'unet3d':
-            print('WARNING: One file does not comply.')
-            print('WARNING: Allowing this failure under olympic scoring '
-                  'rules.')
+            logging.warning(" 1 file does not comply, accepting this under olympic scoring")
         elif errors_found > 0 and errors_found <= 4 and benchmark == 'unet3d':
-            print('WARNING: {errors} file does not comply.'.format(errors=errors_found))
-            print('WARNING: Allowing this failure for unet3d under olympic scoring '
-                  'rules.')
+            logging.warning(" %d files do not comply, accepting this under olympic scoring", errors_found)
         elif errors_found > 0:
             too_many_errors = True
+            logging.error(" %d files do not comply, directory cannot be accepted", errors_found)
 
         # Check if each run use unique seeds.
        if ruleset in {'1.0.0', '1.1.0'} and division == 'closed':
            if not seed_checker.check_seeds(result_files, source_files):
                too_many_errors = True
+               logging.error('Seed checker failed')
 
-       # Run RCP checker for 1.0.0
+       # Run RCP checker for >= 1.0.0
        if ruleset in {'1.0.0', '1.1.0'} and division == 'closed' and benchmark != 'minigo':
            rcp_chk = rcp_checker.make_checker(usage, ruleset, verbose=False, bert_train_samples=rcp_bert_train_samples)
            rcp_chk._compute_rcp_stats()
 
            # Now go again through result files to do RCP checks
            rcp_pass, rcp_msg = rcp_chk._check_directory(benchmark_folder, rcp_bypass)
            if not rcp_pass:
-               print('ERROR: RCP Test Failed: {}.'.format(rcp_msg))
+               logging.error('RCP Test Failed: %s', rcp_msg)
                too_many_errors = True
+           else:
+               logging.info('RCP Test Passed: %s', rcp_msg)
 
        _print_divider_bar()
+
+   _print_divider_bar()
    if too_many_errors:
-       raise Exception(
-           'Found too many errors in logging, see log above for details.')
+       logging.info('PACKAGE CHECKER FOUND ERRORS, LOOK INTO ERROR LOG LINES AND FIX THEM.')
+   else:
+       logging.info('PACKAGE CHECKER FOUND NO ERRORS, SUCCESS !')
 
 
def check_systems(folder, usage, ruleset):
    Args:
        folder: The folder for a submission package.
        usage: The usage such as training, inference_edge, inference_server, hpc.
-       ruleset: The ruleset such as 0.6.0, 0.7.0, or 1.0.0.
+       ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, or 1.1.0.
    """
    system_folder = os.path.join(folder,'systems')
    pattern = '{folder}/*.json'.format(folder=system_folder)
    json_files = glob.glob(pattern)
-   too_many_errors = False
+   too_many_errors = False
    for json_file in json_files:
        valid, _, _, _ = system_desc_checker.check_system_desc(json_file, usage, ruleset)
        if not valid:
            too_many_errors = True
-   if too_many_errors:
-       raise Exception(
-           'Found too many errors in system checking, see log above for details.')
-
+   return not too_many_errors
 
 
-def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rcp_bert_train_samples):
+def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rcp_bert_train_samples, log_output):
    """Checks a training package for compliance.
 
    Args:
        folder: The folder for a submission package.
        usage: The usage such as training or hpc
-       ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0 or 1.0.0.
+       ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0 or 1.1.0.
""" + if ruleset in {'1.0.0', '1.1.0'}: + logging.info(' Checking System Description Files') + if not check_systems(folder, usage, ruleset): + logging.error('System description file checker failed') + check_training_result_files(folder, usage, ruleset, quiet, werror, rcp_bypass, rcp_bert_train_samples) - if ruleset == '1.0.0': - check_systems(folder, usage, ruleset) + _print_divider_bar() + print('\n** Detailed log output is also at', log_output) + def get_parser(): parser = argparse.ArgumentParser( @@ -234,7 +240,7 @@ def get_parser(): 'ruleset', type=str, choices=rule_choices(), - help='the ruleset such as 0.6.0, 0.7.0, or 1.0.0' + help='the ruleset such as 0.6.0, 0.7.0, 1.0.0, or 1.1.0' ) parser.add_argument( '--werror', @@ -258,6 +264,12 @@ def get_parser(): 'bert benchmark is taken from train_samples, ' 'istead of epoch_num', ) + parser.add_argument( + '--log_output', + type=str, + default='package_checker.log', + help='where to store package checker output log' + ) return parser @@ -265,7 +277,14 @@ def main(): parser = get_parser() args = parser.parse_args() - check_training_package(args.folder, args.usage, args.ruleset, args.quiet, args.werror, args.rcp_bypass, args.rcp_bert_train_samples) + logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO) + logging.getLogger().addHandler(logging.StreamHandler()) + formatter = logging.Formatter("%(levelname)s - %(message)s") + logging.getLogger().handlers[0].setFormatter(formatter) + logging.getLogger().handlers[1].setFormatter(formatter) + + check_training_package(args.folder, args.usage, args.ruleset, args.quiet, args.werror, + args.rcp_bypass, args.rcp_bert_train_samples, args.log_output) if __name__ == '__main__': diff --git a/mlperf_logging/package_checker/seed_checker.py b/mlperf_logging/package_checker/seed_checker.py index 4fc4ef5a..4d527a40 100644 --- a/mlperf_logging/package_checker/seed_checker.py +++ b/mlperf_logging/package_checker/seed_checker.py @@ -1,5 +1,6 @@ import warnings import os +import logging from ..compliance_checker import mlp_parser @@ -10,6 +11,10 @@ } +def _print_divider_bar(): + logging.info('------------------------------') + + def is_source_file(path): """ Check if a file is considered as a "source file" by extensions. @@ -137,12 +142,15 @@ def check_seeds(self, result_files, source_files): this benchmark. """ + _print_divider_bar() + logging.info(" Running Seed Checker") no_logged_seed, error_messages = self._assert_unique_seed_per_run( result_files) if len(error_messages) > 0: - print("Seed checker failed and found the following " - "errors:\n{}".format('\n'.join(error_messages))) + logging.error(" Seed checker failed and found the following errors %s: ", join(error_messages)) + #print("Seed checker failed and found the following " + # "errors:\n{}".format('\n'.join(error_messages))) return False if no_logged_seed: diff --git a/mlperf_logging/rcp_checker/__main__.py b/mlperf_logging/rcp_checker/__main__.py index f5b20815..9257b771 100644 --- a/mlperf_logging/rcp_checker/__main__.py +++ b/mlperf_logging/rcp_checker/__main__.py @@ -1,17 +1,27 @@ import sys +import logging from . 
 
 parser = rcp_checker.get_parser()
 args = parser.parse_args()
 
+logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler())
+formatter = logging.Formatter("%(levelname)s - %(message)s")
+logging.getLogger().handlers[0].setFormatter(formatter)
+logging.getLogger().handlers[1].setFormatter(formatter)
+
 # Results summarizer makes these 3 calls to invoke RCP test
 checker = rcp_checker.make_checker(args.rcp_usage, args.rcp_version, args.verbose, args.bert_train_samples)
 checker._compute_rcp_stats()
 test, msg = checker._check_directory(args.dir)
 
 if test:
-    print(msg, ",RCP test passed")
+    logging.info('%s, RCP test PASSED', msg)
+    print('** Logging output also at', args.log_output)
 else:
-    print(msg, ",RCP test failed")
+    logging.error('%s, RCP test FAILED, consider adding --rcp_bypass in when running the package_checker if the RCP is NOT missing', msg)
+    print('** Logging output also at', args.log_output)
     sys.exit(1)
+
diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
index 5084081e..fdd79405 100644
--- a/mlperf_logging/rcp_checker/rcp_checker.py
+++ b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -5,6 +5,7 @@
 import argparse
 import glob
 import json
+import logging
 import os
 import numpy as np
 import re
@@ -33,6 +34,9 @@
 
 TOKEN = ':::MLLOG '
 
+def _print_divider_bar():
+    logging.info('------------------------------')
+
 def _detect_eval_error(file_contents):
     for line in file_contents:
         if TOKEN not in line:
@@ -159,6 +163,7 @@ def _compute_rcp_stats(self):
                     record_contents['RCP Stdev'],
                     len(epoch_list)-samples_rejected*2)
                 record_contents['Max Speedup'] = record_contents['RCP Mean'] / min_epochs
+            # TODO emizan: Remove verbose if 1.1 round goes well w.r.to logs
             if self.verbose:
                 print(record, record_contents, "\n")
 
@@ -288,9 +293,10 @@ def _create_interp_rcp(self, benchmark, target_bs, low_rcp, high_rcp):
                          'RCP Mean': mean,
                          'RCP Stdev': stdev,
                          'Max Speedup': mean / min_epochs}
-        if self.verbose:
-            print(low_rcp, high_rcp)
-            print(interp_record)
+        logging.info(" Creating interpolation record")
+        logging.info(" Low RCP: %s", low_rcp)
+        logging.info(" High RCP: %s", high_rcp)
+        logging.info(" Intepolation record: %s", interp_record)
         self.rcp_data[interp_record_name] = interp_record
 
@@ -300,14 +306,12 @@ def _eval_submission_record(self, rcp_record, subm_epochs):
         samples_rejected = 4 if rcp_record["Benchmark"] == 'unet3d' else 1
         mean_subm_epochs = np.mean(subm_epochs[samples_rejected:len(subm_epochs)-samples_rejected])
         if mean_subm_epochs >= (rcp_record["RCP Mean"] / rcp_record["Max Speedup"]):
-            if self.verbose:
-                print("Found RCP record:\n",rcp_record)
-                print("\nSubm Mean epochs:", mean_subm_epochs)
+            logging.info(" RCP Record: %s", rcp_record)
+            logging.info(" Submission mean epochs: %.4f", mean_subm_epochs)
             return(True)
         else:
-            if self.verbose:
-                print("Found RCP record:\n",rcp_record)
-                print("\nSubm Mean epochs:", mean_subm_epochs)
+            logging.info(" RCP Record: %s", rcp_record)
+            logging.info(" Submission mean epochs: %.4f", mean_subm_epochs)
             return(False)
 
@@ -326,6 +330,9 @@ def _check_directory(self, dir, rcp_bypass=False):
            - (False) Fail / RCP interpolated
            - (False) Missing RCP / Submit missing RCP
         '''
+        _print_divider_bar()
+        logging.info(" Running RCP Checker")
+        _print_divider_bar()
         dir = dir.rstrip("/")
         pattern = '{folder}/result_*.txt'.format(folder=dir)
         benchmark = os.path.split(dir)[1]
@@ -368,7 +375,7 @@ def _check_directory(self, dir, rcp_bypass=False):
         if rcp_bypass and not rcp_check:
             if rcp_msg == 'RCP found' or rcp_msg == 'RCP Interpolation':
                 rcp_msg = rcp_msg + ' passed using rcp_bypass'
-                print('RCP test failed but allowed to proceed with RCP bypass')
+                logging.warning(' RCP test failed but allowed to proceed with RCP bypass')
                 rcp_check = True
         return rcp_check, rcp_msg
 
@@ -385,14 +392,14 @@ def get_parser():
     parser.add_argument('--rcp_usage', type=str, default='training',
                     choices=['training', 'hpc'],
                     help='what WG does the benchmark come from to check the log against')
-    parser.add_argument('--rcp_version', type=str, default='1.0.0',
+    parser.add_argument('--rcp_version', type=str, default='1.1.0',
                     help='what version of rules to check the log against')
     parser.add_argument('--verbose', action='store_true')
     parser.add_argument('--bert_train_samples', action='store_true',
-                    help='If set, num samples used for training '
-                         'bert benchmark is taken from train_samples, '
-                         'istead of epoch_num')
-
+                    help='If set, num samples used for training bert benchmark'
+                         'is taken from train_samples, instead of epoch_num')
+    parser.add_argument('--log_output', type=str, default='rcp_checker.log',
+                    help='where to store RCP checker output log')
     return parser
 
diff --git a/mlperf_logging/system_desc_checker/system_desc_checker.py b/mlperf_logging/system_desc_checker/system_desc_checker.py
index 0d9bd509..45a53339 100644
--- a/mlperf_logging/system_desc_checker/system_desc_checker.py
+++ b/mlperf_logging/system_desc_checker/system_desc_checker.py
@@ -6,6 +6,7 @@
 
 import argparse
 import json
+import logging
 import sys
 
 from ..compliance_checker.mlp_compliance import usage_choices, rule_choices
@@ -114,12 +115,9 @@ def check_system_desc(json_file, usage, ruleset):
     ])
 
     if not valid:
-        print("FAILURE: {}".format(", ".join(invalid_reasons)))
+        logging.error(' System description checker failed for %s : %s', system_name, invalid_reasons)
     else:
-        print("SUCCESS")
-
-    print("Table CSV prefix: {}".format(table_csv_prefix))
-    print("Table CSV postfix: {}".format(table_csv_postfix))
+        logging.info(' System description checker passed for %s', system_name)
 
     return valid, system_name, table_csv_prefix, table_csv_postfix
 
@@ -140,7 +138,8 @@ def get_parser():
                         help='Treat warnings as errors')
     parser.add_argument('--quiet', action='store_true',
                         help='Suppress warnings. Does nothing if --werror is set')
-
+    parser.add_argument('--log_output', type=str, default='system_desc_checker.log',
+                        help='where to store system description checker output log')
     return parser
 
@@ -148,7 +147,14 @@ def main():
     parser = get_parser()
     args = parser.parse_args()
 
+    logging.basicConfig(filename=args.log_output, encoding='utf-8', level=logging.INFO)
+    logging.getLogger().addHandler(logging.StreamHandler())
+    formatter = logging.Formatter("%(levelname)s - %(message)s")
+    logging.getLogger().handlers[0].setFormatter(formatter)
+    logging.getLogger().handlers[1].setFormatter(formatter)
+
     check_system_desc(args.filename, args.usage, args.ruleset)
+    print('** Logging output also at', args.log_output)
 
 
 if __name__ == '__main__':
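
Editor's note: the same five-line logging setup (basicConfig to a file, an extra StreamHandler, and one Formatter applied to both handlers) is repeated in compliance_checker/__main__.py, rcp_checker/__main__.py, package_checker.py, and system_desc_checker.py. A minimal sketch of how that block could be factored into one shared helper is below; the setup_logging name and its placement are hypothetical and not part of this patch.

    import logging
    import sys

    def setup_logging(log_output):
        # Same behaviour as the block repeated in each checker entry point:
        # write to the chosen log file and echo to the console.
        # (The encoding= argument of basicConfig requires Python 3.9+.)
        logging.basicConfig(filename=log_output, encoding='utf-8', level=logging.INFO)
        # StreamHandler() with no argument defaults to stderr; sys.stdout is
        # passed here explicitly since the commit message says stdout.
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
        formatter = logging.Formatter("%(levelname)s - %(message)s")
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)

Each entry point would then call setup_logging(args.log_output) right after parse_args(), keeping the per-checker --log_output defaults (compliance_checker.log, package_checker.log, rcp_checker.log, system_desc_checker.log) unchanged.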