Skip to content

Commit

Permalink
Merge pull request #124 from xyhuang/1.0-branch
Browse files Browse the repository at this point in the history
1.0 branch
  • Loading branch information
xyhuang authored May 15, 2021
2 parents 82bf24e + ab44282 commit 3620611
Show file tree
Hide file tree
Showing 10 changed files with 177 additions and 61 deletions.
28 changes: 15 additions & 13 deletions mlperf_logging/compliance_checker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,30 @@ To check a log file for compliance:

python -m mlperf_logging.compliance_checker [--config YAML] [--ruleset MLPERF_EDITION] FILENAME

By default, 0.7.0 edition rules are used and the default config is set to `0.7.0/common.yaml`.
By default, 1.0.0 edition rules are used and the default config is set to `1.0.0/common.yaml`.
This config will check all common keys and enqueue benchmark specific config to be checked as well.
Older editions that are still supported are 0.7.0 and 0.6.0.

Prints `SUCCESS` when no issues were found. Otherwise will print error details.

For example logs, see [NVIDIA's v0.6 training logs](https://github.com/mlperf/training_results_v0.6/tree/master/NVIDIA/results).

### Existing config files

0.7.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
0.7.0/resnet.yaml
0.7.0/ssd.yaml
0.7.0/minigo.yaml
0.7.0/maskrcnn.yaml
0.7.0/gnmt.yaml
0.7.0/transformer.yaml
0.7.0/bert.yaml
0.7.0/dlrm.yaml
1.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
1.0.0/resnet.yaml
1.0.0/ssd.yaml
1.0.0/minigo.yaml
1.0.0/maskrcnn.yaml
1.0.0/rnnt.yaml
1.0.0/unet3d.yaml
1.0.0/bert.yaml
1.0.0/dlrm.yaml

### Implementation details
Compliance checking is done following below algorithm.

1. Parser converts the log into a list of records, each record corresponds to MLL
1. Parser converts the log into a list of records, each record corresponds to MLLOG
line and contains all relevant extracted information
2. Set of rules to be checked is loaded from provided config yaml file
3. Process optional `BEGIN` rule if present by executing provided `CODE` section
Expand Down Expand Up @@ -114,7 +115,7 @@ Example:
`ll` is a structure representing current log line that triggered `KEY` record. `ll` has the following fields
that can be accessed:
- `full_string` - the complete line as a string
- `timestamp` - seconds as a float, e.g. 1234.567
- `timestamp` - milliseconds as an integer
- `key` - the string key
- `value` - the parsed value associated with the key, or None if no value
- `lineno` - line number in the original file of the current key
Expand Down Expand Up @@ -143,7 +144,7 @@ Example:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'transformer', 'gnmt'] "
POST: " enqueue_config('0.7.0/{}.yaml'.format(v['value'])) "
POST: " enqueue_config('1.0.0/{}.yaml'.format(v['value'])) "


#### Other operations
Expand All @@ -158,6 +159,7 @@ For instance, can define rules that would print out information as shown in the
Tested and confirmed working using the following software versions:
- Python 2.7.12 + PyYAML 3.11
- Python 3.6.8 + PyYAML 5.1
- Python 3.9.2 + PyYAML 5.3.1

### How to install PyYAML

Expand Down
2 changes: 2 additions & 0 deletions mlperf_logging/compliance_checker/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@

if not valid:
sys.exit(1)
else:
print('SUCCESS')
6 changes: 6 additions & 0 deletions mlperf_logging/mllog/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@
TRANSFORMER = "transformer"
RNNT = "rnnt"
UNET3D = "unet3d"
BERT ="bert"

# Constant values - model info
ADAM = "adam"
LARS = "lars"
LAZY_ADAM = "lazy_adam"
SGD = "sgd"
LAMB ="lamb"

# Constant values - metadata info
ABORTED = "aborted"
Expand Down Expand Up @@ -101,13 +103,15 @@
MIN_IMAGE_SIZE = "min_image_size"
MODEL_BN_SPAN = "model_bn_span"
NUM_IMAGE_CANDIDATES = "num_image_candidates"
NUM_WARMUP_STEPS = "num_warmup_steps"
OPT_ADAM_BETA_1 = "opt_adam_beta_1"
OPT_ADAM_BETA_2 = "opt_adam_beta_2"
OPT_ADAM_EPSILON = "opt_adam_epsilon"
OPT_NAME = "opt_name"
OPT_BASE_LR = "opt_base_learning_rate"
OPT_LAMB_LR_MIN = "opt_lamb_learning_rate_min"
OPT_LAMB_LR_DECAY_POLY_POWER = "opt_lamb_learning_rate_decay_poly_power"
OPT_LAMB_WEIGHT_DECAY = "opt_lamb_weight_decay_rate"
OPT_LAMB_BETA_1 = "opt_lamb_beta_1"
OPT_LAMB_BETA_2 = "opt_lamb_beta_2"
OPT_LAMB_EPSILON = "opt_lamb_epsilon"
Expand All @@ -121,6 +125,7 @@
OPT_LR_DECAY_START_STEP = "opt_learning_rate_decay_start_step"
OPT_LR_DECAY_STEPS = "opt_learning_rate_decay_steps"
OPT_LR_REMAIN_STEPS = "opt_learning_rate_remain_steps"
OPT_LR_TRAINING_STEPS = "opt_learning_rate_training_steps"
OPT_LR_WARMUP_EPOCHS = "opt_learning_rate_warmup_epochs"
OPT_LR_WARMUP_FACTOR = "opt_learning_rate_warmup_factor"
OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps"
Expand All @@ -140,6 +145,7 @@
MODEL_EVAL_EMA_FACTOR = "model_eval_ema_factor"
MODEL_WEIGHTS_INITIALIZATION_SCALE = "model_weights_initialization_scale"
EVAL_MAX_PREDICTION_SYMBOLS = "eval_max_prediction_symbols"
START_WARMUP_STEP ="start_warmup_step"

# Log keys - misc.
BBOX = "bbox"
Expand Down
30 changes: 15 additions & 15 deletions mlperf_logging/rcp_checker/1.0.0/rcps_bert.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,42 +38,42 @@
2508800, 2458624, 2684416, 2533888, 2533888, 2784768, 2308096, 2784768, 2584064, 2809856]
},

"bert_ref_768":
"bert_ref_1536":
{
"Benchmark": "bert",
"BS": 768,
"BS": 1536,
"Hyperparams": {
"opt_base_learning_rate": 0.00035,
"opt_base_learning_rate": 0.002,
"opt_epsilon": 1e-6,
"opt_learning_rate_training_steps": 8000,
"num_warmup_steps": 420,
"opt_learning_rate_training_steps": 2254,
"num_warmup_steps": 0,
"start_warmup_step": 0,
"opt_lamb_beta_1": 0.91063,
"opt_lamb_beta_2": 0.96497,
"opt_lamb_beta_1": 0.66,
"opt_lamb_beta_2": 0.996,
"opt_lamb_weight_decay_rate": 0.01
},
"Epochs to converge": [
3979008, 3598848, 3598848, 3776256, 3168000, 3370752, 3598848, 3472128, 3826944, 3472128,
3066624, 3345408, 3269376, 3776256, 3396096, 3852288, 3294720, 4004352, 3396096, 3091968]
2836240, 2801664, 2801664, 2727936, 2801664, 2875392, 2899968, 2727936, 2777088, 2875392,
2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120]
},

"bert_ref_3072":
{
"Benchmark": "bert",
"BS": 3072,
"Hyperparams": {
"opt_base_learning_rate": 0.0015,
"opt_base_learning_rate": 0.002,
"opt_epsilon": 1e-6,
"opt_learning_rate_training_steps": 1271,
"opt_learning_rate_training_steps": 1141,
"num_warmup_steps": 100,
"start_warmup_step": 0,
"opt_lamb_beta_1": 0.9,
"opt_lamb_beta_2": 0.999,
"opt_lamb_beta_1": 0.66,
"opt_lamb_beta_2": 0.998,
"opt_lamb_weight_decay_rate": 0.01
},
"Epochs to converge": [
3465216, 3563520, 3489792, 3416064, 3489792, 3514368, 3760128, 3489792, 3612672, 3465216,
3317760, 3661824, 3268608, 3563520, 3588096, 3366912, 3538944, 3489792, 3489792, 3710976]
2703360, 2482176, 3072000, 2654208, 2580480, 2727936, 2605056, 2801664, 2777088, 2580480,
2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816]
},

"bert_ref_8192":
Expand Down
10 changes: 6 additions & 4 deletions mlperf_logging/rcp_checker/1.0.0/rcps_dlrm.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
"sgd_opt_learning_rate_decay_steps": 30000
},
"Epochs to converge": [
0.8, 0.75, 0.75, 0.75, 0.75, 0.8, 0.7, 0.75, 0.75, 0.75]
1.8, 1.75, 1.75, 1.75, 1.75, 1.8, 1.7, 1.75, 1.75, 1.75
]
},

"dlrm_ref_55296":
Expand All @@ -29,9 +30,10 @@
"sgd_opt_learning_rate_decay_steps": 27772
},
"Epochs to converge": [
0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95,
0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9,
0.9, 0.9, 0.85, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9]
1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.95,
1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9,
1.9, 1.9, 1.85, 1.9, 1.9, 1.9, 1.9, 1.95, 1.9
]
}

}
Expand Down
27 changes: 23 additions & 4 deletions mlperf_logging/rcp_checker/1.0.0/rcps_resnet.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@
"epsilon": 0,
"opt_learning_rate_warmup_epochs": 5,
"opt_momentum": 0.9,
"opt_weight_decay": 2e-3,
"opt_learning_rate_decay_steps": 6720
"opt_weight_decay": 2e-4,
"opt_learning_rate_decay_steps": 6095
},
"Epochs to converge": [
41, 40, 42, 42, 41, 41, 42, 42, 41, 41]
42, 44, 43, 41, 41, 41, 42, 42, 43, 41]
},

"resnet_ref_32768":
Expand All @@ -68,12 +68,31 @@
"opt_learning_rate_decay_poly_power": 2,
"epsilon": 0,
"opt_learning_rate_warmup_epochs": 16,
"opt_momentum": 2.5e-5,
"opt_momentum": 0.94,
"opt_weight_decay": 2e-3,
"opt_learning_rate_decay_steps": 58
},
"Epochs to converge": [
56, 56, 55, 56, 56, 56, 56, 56, 57, 56]
},

"resnet_ref_65536":
{
"Benchmark": "resnet",
"BS": 65536,
"Hyperparams": {
"optimizer": "lars",
"opt_base_learning_rate": 24.699,
"opt_end_learning_rate": 1e-4,
"opt_learning_rate_decay_poly_power": 2,
"epsilon": 0,
"opt_learning_rate_warmup_epochs": 31,
"opt_momentum": 0.951807,
"opt_weight_decay": 1e-4,
"opt_learning_rate_decay_steps": 1133
},
"Epochs to converge": [
83, 85, 84, 86, 85, 85, 83, 84, 85, 85]
}

}
Expand Down
4 changes: 2 additions & 2 deletions mlperf_logging/rcp_checker/1.0.0/rcps_rnnt.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"rnn_t_ref_1k":
{
"Benchmark": "rnnt",
"BS": 128,
"BS": 1024,
"Hyperparams": {
"opt_base_learning_rate": 0.004,
"opt_lamb_learning_rate_hold_epochs": 40,
Expand All @@ -26,7 +26,7 @@
"rnn_t_ref_2k":
{
"Benchmark": "rnnt",
"BS": 256,
"BS": 2048,
"Hyperparams": {
"opt_base_learning_rate": 0.007,
"opt_lamb_learning_rate_hold_epochs": 40,
Expand Down
43 changes: 21 additions & 22 deletions mlperf_logging/rcp_checker/1.0.0/rcps_unet3d.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{

"unet3d_ref_2":
"unet3d_ref_2_fp32":
{
"Benchmark": "unet3d",
"BS": 2,
Expand All @@ -9,21 +9,18 @@
"opt_learning_rate_warmup_epochs": 200
},
"Epochs to converge": [
1980, 1940, 2800, 3020, 2920, 1820, 2300, 2200, 2400, 1780,
2840, 3880, 2120, 2860, 1920, 1480, 2380, 2360, 2220, 3920,
2640, 2240, 2100, 2740, 1740, 3360, 2000, 2460, 2460, 2680,
2320, 2000, 2040, 2180, 2540, 1400, 1720, 1860, 2940, 1880,
1980, 2020, 2440, 2020, 2780, 1660, 2320, 2380, 2680, 2000,
3140, 1680, 1660, 2560, 2660, 1560, 2100, 2000, 2300, 2240,
1780, 2460, 2240, 3500, 1520, 3360, 2260, 2280, 2440, 2800,
2380, 2020, 2880, 2720, 3960, 3840, 3220, 1300, 3140, 3160,
3820, 3220, 2640, 3220, 3680, 2860, 3740, 2320, 2260, 3660,
2260, 2560, 1760, 2720, 1940, 2640, 2200, 2500, 2640, 3460,
1660, 2480, 1560, 2720, 2840, 2300, 1740, 3720, 2800, 3940,
3460, 3380, 3580, 2360, 2720, 3320, 2360, 2980, 3000, 3800,
2100, 1720, 2700, 1780, 3260, 2680, 2140, 3680, 2700]
3420, 3420, 1440, 2320, 2940, 2240, 2600, 2840, 3320, 2360,
4040, 2920, 3360, 2080, 3060, 2900, 4000, 3120, 2120, 2540,
1880, 2640, 2660, 2160, 1420, 2880, 2360, 2260, 2900, 2640,
2380, 3060, 1880, 2420, 2560, 2580, 2180, 2960, 2480, 2140,
3500, 2420, 2500, 3860, 1620, 2260, 2160, 1280, 2320, 2140,
2580, 3020, 2480, 3300, 2140, 3400, 2940, 2520, 3680, 3380,
3080, 2660, 2980, 2740, 2140, 2140, 3000, 2820, 2960, 2420,
2760, 2940, 3280, 2660, 2200, 1660, 1520, 2320, 2180, 2280,
2960, 2140, 3280, 2980, 3580, 3280, 3420]
},
"unet3d_ref_32":

"unet3d_ref_32_amp":
{
"Benchmark": "unet3d",
"BS": 32,
Expand All @@ -32,13 +29,15 @@
"opt_learning_rate_warmup_epochs": 1000
},
"Epochs to converge": [
2220, 1960, 3200, 2440, 2000, 2060, 2420, 2160, 2480, 2480,
3460, 2280, 1660, 2500, 3040, 1860, 2020, 2100, 2560, 3660,
2100, 1760, 2720, 1360, 1580, 4680, 1860, 1680, 1740, 2120,
1720, 2140, 1740, 2220, 1900, 1680, 3040, 1820, 2420, 1380,
2020, 2420, 2020, 2660, 3680, 1740, 2600, 2720, 1940, 2420,
2160, 2060, 2620, 2500, 2080, 3040, 1820, 2780, 1780, 1880,
2240, 2460, 1860]
1512, 3492, 1422, 2052, 2610, 1908, 2052, 1692, 1674, 2196,
2682, 2412, 1980, 2556, 2466, 2358, 2880, 1638, 1890, 2178,
1764, 1872, 2070, 2322, 2178, 2070, 2916, 1548, 1998, 2214,
2034, 2322, 1602, 2610, 1908, 1944, 2646, 2250, 2268, 1854,
1206, 2610, 2394, 2214, 1710, 3240, 2070, 1278, 2034, 1314,
2376, 1530, 1656, 1674, 1494, 2160, 2862, 1152, 1440, 1926,
1440, 2250, 2358, 1836, 2178, 1818, 1458, 1188, 2358, 1692,
1962, 2412, 1296, 2232, 2196, 1926, 1260, 2070, 3042, 2106,
2088, 1926, 2430, 1764, 1854, 2430, 2214, 1638, 2790]
}

}
Expand Down
2 changes: 1 addition & 1 deletion mlperf_logging/rcp_checker/rcp_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_submission_epochs(result_files, benchmark):
if conv_result == "success":
subm_epochs.append(conv_epoch)
else:
subm_epochs.append(-1)
subm_epochs.append(1e9)
not_converged = not_converged + 1
if (not_converged > 1 and benchmark != 'unet3d') or (not_converged > 4 and benchmark == 'unet3d'):
subm_epochs = None
Expand Down
Loading

0 comments on commit 3620611

Please sign in to comment.