Skip to content

Commit

Permalink
Merge pull request #225 from mlcommons/repository_check
Browse files Browse the repository at this point in the history
Fixes for issues #205, and #224.
  • Loading branch information
emizan76 authored Apr 13, 2022
2 parents 55dc8e4 + 95a5da0 commit ce4e482
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 1 deletion.
25 changes: 25 additions & 0 deletions mlperf_logging/repo_checker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# MLPerf repository checker

MLPerf repository checker

## Usage

To check whether an organization's submission package is compatible with GitHub
and whether it will cause any problems when added to GitHub with a PR during the
review process, run:

```sh
python3 -m mlperf_logging.repo_checker FOLDER USAGE RULESET
```

Currently, USAGE in ["training"] and RULESET in ["2.0.0"] are supported.

The repo checker checks:
1. Whether the repo contains filenames that github does not like, e.g. files with spaces,
files that start with '.' or '/.'
2. Files that violate the github file limit (50MB)

## Tested software versions
Tested and confirmed working using the following software versions:

Python 3.9.9
Empty file.
3 changes: 3 additions & 0 deletions mlperf_logging/repo_checker/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Entry point for ``python3 -m mlperf_logging.repo_checker``.
from . import repo_checker

# Parse the command line and run the repository checks. The value returned
# by main() is not used here.
repo_checker.main()
114 changes: 114 additions & 0 deletions mlperf_logging/repo_checker/repo_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import argparse
import logging
import os
import subprocess


def _check_bad_filenames(submission_dir):
    """Check the submission tree for file names that git handles poorly.

    Walks ``submission_dir`` recursively and flags every file whose base
    name starts with "." or contains a space.

    Args:
        submission_dir: Root folder of the submission package.

    Returns:
        True when no offending file name is found, False otherwise (the
        offending paths are logged as errors).
    """
    logging.info('Running git-unfriendly file name checks.')
    # os.walk yields bare file names (no path separator), so a "/."
    # substring test on them can never match; only the leading-dot and
    # space checks are needed.
    names = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(submission_dir)
        for filename in filenames
        if filename.startswith('.') or ' ' in filename
    ]
    if not names:
        return True

    logging.error('Files with git-unfriendly name: %s ', '\n'.join(names))
    logging.error(
        'Please remove spaces from filenames and make sure they do not '
        'start with ".", or "/."'
    )
    return False


def _check_file_sizes(submission_dir):
    """Check the submission tree for files above the 50MB size limit.

    Walks ``submission_dir`` recursively with ``os.walk`` rather than an
    external ``find`` process, so the check works on platforms where that
    binary is not available.

    Args:
        submission_dir: Root folder of the submission package.

    Returns:
        True when no regular file is larger than 50 MiB, False otherwise
        (the offending paths are logged as errors).
    """
    logging.info('Running large file checks.')
    # Same threshold as `find -size +50M`: strictly more than 50 MiB.
    size_limit = 50 * 1024 * 1024
    large_files = []
    for dirpath, _, filenames in os.walk(submission_dir):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            # Same selection as `find -type f`: regular files only;
            # symlinks and special files are skipped.
            if os.path.islink(path) or not os.path.isfile(path):
                continue
            if os.path.getsize(path) > size_limit:
                large_files.append(path)

    if large_files:
        logging.error('Files > 50MB: %s', '\n'.join(large_files))
        logging.error('Please remove or reduce the size of these files.')
        return False
    return True


def run_checks(submission_dir):
    """Run every repository check on a submission folder.

    Both checks are always executed (no short-circuit) so that a single
    run reports every problem found.

    Args:
        submission_dir: Root folder of the submission package.

    Returns:
        True when all checks pass, False when at least one fails.
    """
    logging.info('Running repository checks.')

    # Each checker returns True when its check passed.
    filenames_ok = _check_bad_filenames(submission_dir)
    file_sizes_ok = _check_file_sizes(submission_dir)

    if not (filenames_ok and file_sizes_ok):
        logging.info('CHECKS FAILED.')
        return False

    logging.info('ALL CHECKS PASSED.')
    return True


def get_parser():
    """Build the command-line parser for the repository checker.

    Positional arguments are ``folder``, ``usage`` (only "training") and
    ``ruleset`` (only "2.0.0"); ``--log_output`` optionally selects the
    log file and defaults to "repo_checker.log".

    Returns:
        The configured ``argparse.ArgumentParser``; parsing is left to the
        caller.
    """
    parser = argparse.ArgumentParser(
        prog='mlperf_logging.repo_checker',
        description='Sanity checks to make sure that package is github compliant.',
    )

    parser.add_argument(
        'folder',
        type=str,
        help='the folder for a submission package.',
    )
    parser.add_argument(
        'usage',
        type=str,
        choices=['training'],
        help='the usage -- only training is currently supported.',
    )
    parser.add_argument(
        'ruleset',
        type=str,
        choices=['2.0.0'],
        help='the ruleset. Only 2.0.0 is currently supported.',
    )
    parser.add_argument(
        '--log_output',
        type=str,
        default='repo_checker.log',
        help='the file to write the log output to.',
    )
    return parser


def main():
    """Command-line entry point: parse arguments, set up logging, run checks.

    Logging is configured on the root logger with the file given by
    ``--log_output`` plus a default ``StreamHandler``; both use the
    "LEVEL - message" format.

    Returns:
        The value returned by ``run_checks`` for the given folder.
    """
    args = get_parser().parse_args()

    root_logger = logging.getLogger()
    logging.basicConfig(filename=args.log_output, level=logging.INFO)
    root_logger.addHandler(logging.StreamHandler())

    # Apply one shared format to the first two handlers of the root logger.
    shared_format = logging.Formatter("%(levelname)s - %(message)s")
    for handler in root_logger.handlers[:2]:
        handler.setFormatter(shared_format)

    return run_checks(args.folder)


if __name__ == '__main__':
    main()
15 changes: 14 additions & 1 deletion scripts/verify_for_v2.0_training.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
#!/bin/bash

# Abort on the first command that fails.
set -e
# NOTE(review): package_checker is invoked again further down with the extra
# parameters; this plain invocation looks like a leftover -- confirm.
python3 -m mlperf_logging.package_checker $1 training 2.0.0

# Extra package checker parameters (e.g. rcp_bypass, rcp_bert_train_samples)
# are read from an optional "package_checker_params" file at the top level of
# the submission dir: one parameter per line, without the leading "--".
PACKAGE_CHECKER_PARAMS=""
PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params"
if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then
while IFS= read -r line
do
PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line"
done < "$PACKAGE_CHECKER_PARAMS_FILE"
fi

# Run the three tools in order against the submission dir passed as $1.
python3 -m mlperf_logging.package_checker $1 training 2.0.0 $PACKAGE_CHECKER_PARAMS
python3 -m mlperf_logging.result_summarizer $1 training 2.0.0
python3 -m mlperf_logging.repo_checker $1 training 2.0.0

0 comments on commit ce4e482

Please sign in to comment.