From 5e343f5f23b4a90cca2beec416b87d4dd7a4264f Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Thu, 28 Jan 2021 14:18:48 -0800
Subject: [PATCH] Remove --distributed-wrapper (consolidate to --ddp-backend)
 (#1544)

Summary:
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1544

Remove the separate --distributed-wrapper option and fold its choices into
--ddp-backend, which now accepts:

* pytorch_ddp (default; c10d is kept as an alias)
* legacy_ddp (no_c10d is kept as an alias)
* slow_mo (replaces --distributed-wrapper=SlowMo)

DistributedFairseqModel and the Trainer now branch on --ddp-backend alone,
and the docs, example commands and wav2vec configs are updated to the new
names.

Test Plan: Imported from OSS

Reviewed By: girifb

Differential Revision: D25836856

Pulled By: myleott

fbshipit-source-id: eb0a6a02f4d9fe2b6b12a456ef95208dd92c97cb
---
 examples/cross_lingual_language_model/README.md | 2 +-
 examples/language_model/README.adaptive_inputs.md | 2 +-
 examples/language_model/README.conv.md | 2 +-
 examples/latent_depth/README.md | 2 +-
 examples/mbart/README.md | 2 +-
 examples/nonautoregressive_translation/README.md | 2 +-
 examples/nonautoregressive_translation/scripts.md | 12 ++++++------
 examples/pay_less_attention_paper/README.md | 6 +++---
 examples/quant_noise/README.md | 10 +++++-----
 examples/roberta/README.race.md | 2 +-
 examples/roberta/commonsense_qa/README.md | 2 +-
 examples/roberta/wsc/README.md | 4 ++--
 examples/translation/README.md | 2 +-
 examples/translation_moe/README.md | 2 +-
 examples/wav2vec/config/finetuning/base_100h.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_10h.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_10m.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_1h.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_960h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_100h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_10h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_10m.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_1h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_960h.yaml | 2 +-
 .../pretraining/wav2vec2_base_librispeech.yaml | 2 +-
 .../config/pretraining/wav2vec2_large_librivox.yaml | 2 +-
 fairseq/criterions/adaptive_loss.py | 6 +++---
 fairseq/dataclass/configs.py | 8 ++------
 fairseq/dataclass/constants.py | 9 +++++++--
 fairseq/distributed_utils.py | 2 +-
 fairseq/models/distributed_fairseq_model.py | 6 +++---
 fairseq/trainer.py | 9 +++++----
 32 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/examples/cross_lingual_language_model/README.md b/examples/cross_lingual_language_model/README.md
index f4c76cfed5..af9128e39e 100644
--- a/examples/cross_lingual_language_model/README.md
+++ b/examples/cross_lingual_language_model/README.md
@@ -68,7 +68,7 @@ fairseq-train \
 --dataset-impl lazy --seed 0 \
 --masked-lm-only \
 --monolingual-langs 'ar,de,en,hi,fr' --num-segment 5 \
---ddp-backend=no_c10d
+--ddp-backend=legacy_ddp
 ```
 
 Some Notes:
diff --git a/examples/language_model/README.adaptive_inputs.md b/examples/language_model/README.adaptive_inputs.md
index 98043c5377..6650d58f37 100644
--- a/examples/language_model/README.adaptive_inputs.md
+++ b/examples/language_model/README.adaptive_inputs.md
@@ -22,7 +22,7 @@ fairseq-train --task language_modeling \
     --max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
     --warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
     --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
-    --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
+    --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
 ```
 
 ## Citation
diff --git a/examples/language_model/README.conv.md b/examples/language_model/README.conv.md
index f0b6a3a921..1ff8635906 100644
---
a/examples/language_model/README.conv.md +++ b/examples/language_model/README.conv.md @@ -17,7 +17,7 @@ fairseq-train --task language_modeling \ --optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \ --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ --max-tokens 1024 --tokens-per-sample 1024 \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --max-epoch 35 ``` diff --git a/examples/latent_depth/README.md b/examples/latent_depth/README.md index e70e16405c..7774c33305 100644 --- a/examples/latent_depth/README.md +++ b/examples/latent_depth/README.md @@ -30,7 +30,7 @@ fairseq-train ${databin_dir} \ --lr 0.0015 \ --clip-norm 1.0 \ --seed 2 \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --encoder-layers 12 \ --decoder-layers 24 \ --decoder-latent-layer \ diff --git a/examples/mbart/README.md b/examples/mbart/README.md index 8a3e22d425..a45e37243c 100644 --- a/examples/mbart/README.md +++ b/examples/mbart/README.md @@ -81,7 +81,7 @@ fairseq-train path_2_data \ --restore-file $PRETRAIN \ --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ --langs $langs \ - --ddp-backend no_c10d + --ddp-backend legacy_ddp ``` ## Generate on EN-RO Get sacrebleu on finetuned en-ro model diff --git a/examples/nonautoregressive_translation/README.md b/examples/nonautoregressive_translation/README.md index 7b2d42a91d..8793e225c9 100644 --- a/examples/nonautoregressive_translation/README.md +++ b/examples/nonautoregressive_translation/README.md @@ -36,7 +36,7 @@ The following command will train a *Levenshtein Transformer* on the binarized da fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ diff --git a/examples/nonautoregressive_translation/scripts.md b/examples/nonautoregressive_translation/scripts.md index a3a33e6e02..9d3d7b67dc 100644 --- a/examples/nonautoregressive_translation/scripts.md +++ b/examples/nonautoregressive_translation/scripts.md @@ -6,7 +6,7 @@ Note that we need to have an additional module to perform "length prediction" (` fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nonautoregressive_transformer \ @@ -35,7 +35,7 @@ Note that we implemented a low-rank appromixated CRF model by setting `--crf-low fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nacrf_transformer \ @@ -68,7 +68,7 @@ Note that `--train-step` means how many iterations of refinement we used during fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch iterative_nonautoregressive_transformer \ @@ -101,7 +101,7 @@ Note that we need to specify the "slot-loss" (uniform or balanced tree) describe fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch insertion_transformer \ @@ -128,7 +128,7 @@ fairseq-train \ fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch cmlm_transformer \ @@ -157,7 
+157,7 @@ fairseq-train \ fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ diff --git a/examples/pay_less_attention_paper/README.md b/examples/pay_less_attention_paper/README.md index d5b19af6cc..5adab11f4d 100644 --- a/examples/pay_less_attention_paper/README.md +++ b/examples/pay_less_attention_paper/README.md @@ -113,7 +113,7 @@ CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \ --log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler inverse_sqrt \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \ --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \ -a lightconv_iwslt_de_en --save-dir $SAVE \ @@ -138,7 +138,7 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ - --ddp-backend=no_c10d --max-tokens 3584 \ + --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 20000 \ @@ -163,7 +163,7 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ - --ddp-backend=no_c10d --max-tokens 3584 \ + --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 70000 \ diff --git a/examples/quant_noise/README.md b/examples/quant_noise/README.md index 7fe301f732..539c3d5af9 100644 --- a/examples/quant_noise/README.md +++ b/examples/quant_noise/README.md @@ -154,7 +154,7 @@ fairseq-train $DATA_DIR \ --batch-size $MAX_SENTENCES \ --update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \ --save-dir checkpoint/roberta \ - --ddp-backend no_c10d --encoder-layerdrop 0.2 \ + --ddp-backend legacy_ddp --encoder-layerdrop 0.2 \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta ``` @@ -189,7 +189,7 @@ fairseq-train /path/to/rte/data/ \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 ``` @@ -205,7 +205,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \ --arch transformer_lm_gbw \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --clip-norm 0.1 --criterion adaptive_loss \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \ --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \ @@ -252,7 +252,7 @@ fairseq-train --task sentence_prediction 
/path/to/data/ \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 --lr-scheduler polynomial_decay \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ - --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \ + --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \ --quantization-config-path /path/to/config/yaml ``` @@ -266,7 +266,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \ --clip-norm 0.1 --criterion adaptive_loss \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --fp16 --keep-last-epochs -1 \ --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \ diff --git a/examples/roberta/README.race.md b/examples/roberta/README.race.md index 527a0bce14..13c917e8ec 100644 --- a/examples/roberta/README.race.md +++ b/examples/roberta/README.race.md @@ -19,7 +19,7 @@ UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. DATA_DIR=/path/to/race-output-dir ROBERTA_PATH=/path/to/roberta/model.pt -CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=no_c10d \ +CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ diff --git a/examples/roberta/commonsense_qa/README.md b/examples/roberta/commonsense_qa/README.md index 4f371f8b30..05c6f841a8 100644 --- a/examples/roberta/commonsense_qa/README.md +++ b/examples/roberta/commonsense_qa/README.md @@ -39,7 +39,7 @@ DATA_DIR=data/CommonsenseQA FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa -CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \ +CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \ $DATA_DIR \ --user-dir $FAIRSEQ_USER_DIR \ --restore-file $ROBERTA_PATH \ diff --git a/examples/roberta/wsc/README.md b/examples/roberta/wsc/README.md index d40da6a5fd..21a045d999 100644 --- a/examples/roberta/wsc/README.md +++ b/examples/roberta/wsc/README.md @@ -51,7 +51,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ - --fp16 --ddp-backend no_c10d \ + --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task wsc --criterion wsc --wsc-cross-entropy \ --arch roberta_large --bpe gpt2 --max-positions 512 \ @@ -110,7 +110,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ - --fp16 --ddp-backend no_c10d \ + --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task winogrande --criterion winogrande \ --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \ diff --git a/examples/translation/README.md b/examples/translation/README.md index 7b1fcc8de2..2941f5eb84 100644 --- a/examples/translation/README.md +++ 
b/examples/translation/README.md @@ -263,7 +263,7 @@ fairseq-preprocess --source-lang fr --target-lang en \ mkdir -p checkpoints/multilingual_transformer CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \ --max-epoch 50 \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task multilingual_translation --lang-pairs de-en,fr-en \ --arch multilingual_transformer_iwslt_de_en \ --share-decoders --share-decoder-input-output-embed \ diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md index 3cc3fb46dc..2e5c8af617 100644 --- a/examples/translation_moe/README.md +++ b/examples/translation_moe/README.md @@ -15,7 +15,7 @@ The model is trained with online responsibility assignment and shared parameteri The following command will train a `hMoElp` model with `3` experts: ```bash -fairseq-train --ddp-backend='no_c10d' \ +fairseq-train --ddp-backend='legacy_ddp' \ data-bin/wmt17_en_de \ --max-update 100000 \ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ diff --git a/examples/wav2vec/config/finetuning/base_100h.yaml b/examples/wav2vec/config/finetuning/base_100h.yaml index 7d1664a184..539dabb047 100644 --- a/examples/wav2vec/config/finetuning/base_100h.yaml +++ b/examples/wav2vec/config/finetuning/base_100h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_10h.yaml b/examples/wav2vec/config/finetuning/base_10h.yaml index 31125947c0..16a3c4d96c 100644 --- a/examples/wav2vec/config/finetuning/base_10h.yaml +++ b/examples/wav2vec/config/finetuning/base_10h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_10m.yaml b/examples/wav2vec/config/finetuning/base_10m.yaml index 2235504489..3ceb77a252 100644 --- a/examples/wav2vec/config/finetuning/base_10m.yaml +++ b/examples/wav2vec/config/finetuning/base_10m.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_1h.yaml b/examples/wav2vec/config/finetuning/base_1h.yaml index 2235504489..3ceb77a252 100644 --- a/examples/wav2vec/config/finetuning/base_1h.yaml +++ b/examples/wav2vec/config/finetuning/base_1h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_960h.yaml b/examples/wav2vec/config/finetuning/base_960h.yaml index d742c94abf..e393805ad8 100644 --- a/examples/wav2vec/config/finetuning/base_960h.yaml +++ b/examples/wav2vec/config/finetuning/base_960h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 8 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_100h.yaml b/examples/wav2vec/config/finetuning/vox_100h.yaml index 8885c78470..2fdb0c568c 100644 --- a/examples/wav2vec/config/finetuning/vox_100h.yaml +++ b/examples/wav2vec/config/finetuning/vox_100h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 
criterion: diff --git a/examples/wav2vec/config/finetuning/vox_10h.yaml b/examples/wav2vec/config/finetuning/vox_10h.yaml index c0957c0058..f1a979e05d 100644 --- a/examples/wav2vec/config/finetuning/vox_10h.yaml +++ b/examples/wav2vec/config/finetuning/vox_10h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_10m.yaml b/examples/wav2vec/config/finetuning/vox_10m.yaml index 0d567552d7..d12439bb28 100644 --- a/examples/wav2vec/config/finetuning/vox_10m.yaml +++ b/examples/wav2vec/config/finetuning/vox_10m.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_1h.yaml b/examples/wav2vec/config/finetuning/vox_1h.yaml index 10c45a52d8..7f3b04c034 100644 --- a/examples/wav2vec/config/finetuning/vox_1h.yaml +++ b/examples/wav2vec/config/finetuning/vox_1h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_960h.yaml b/examples/wav2vec/config/finetuning/vox_960h.yaml index 6212a2e738..0633915bb2 100644 --- a/examples/wav2vec/config/finetuning/vox_960h.yaml +++ b/examples/wav2vec/config/finetuning/vox_960h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 24 criterion: diff --git a/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml b/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml index e2c2b7b0b3..767aee2852 100644 --- a/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml +++ b/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml @@ -23,7 +23,7 @@ dataset: distributed_training: distributed_world_size: 64 - ddp_backend: no_c10d + ddp_backend: legacy_ddp criterion: _name: wav2vec diff --git a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml index 0c911b7491..bee41157a9 100644 --- a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml +++ b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml @@ -24,7 +24,7 @@ dataset: distributed_training: distributed_world_size: 128 - ddp_backend: no_c10d + ddp_backend: legacy_ddp criterion: _name: wav2vec diff --git a/fairseq/criterions/adaptive_loss.py b/fairseq/criterions/adaptive_loss.py index 15ad9a15bf..6209ceaedb 100644 --- a/fairseq/criterions/adaptive_loss.py +++ b/fairseq/criterions/adaptive_loss.py @@ -32,11 +32,11 @@ def __init__(self, task, sentence_avg): @classmethod def build_criterion(cls, cfg: AdaptiveLossConfig, task): - if cfg.ddp_backend == "c10d": + if cfg.ddp_backend in {"c10d", "pytorch_ddp"}: raise Exception( - "AdaptiveLoss is not compatible with the c10d " + "AdaptiveLoss is not compatible with the PyTorch " "version of DistributedDataParallel. Please use " - "`--ddp-backend=no_c10d` instead." + "`--ddp-backend=legacy_ddp` instead." 
) return cls(task, cfg.sentence_avg) diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py index 2ed27284dc..f66e98fe83 100644 --- a/fairseq/dataclass/configs.py +++ b/fairseq/dataclass/configs.py @@ -12,7 +12,6 @@ from fairseq.dataclass.constants import ( DATASET_IMPL_CHOICES, DDP_BACKEND_CHOICES, - DISTRIBUTED_WRAPPER_CHOICES, GENERATION_CONSTRAINTS_CHOICES, GENERATION_DECODING_FORMAT_CHOICES, LOG_FORMAT_CHOICES, @@ -236,7 +235,7 @@ class DistributedTrainingConfig(FairseqDataclass): }, ) ddp_backend: DDP_BACKEND_CHOICES = field( - default="c10d", metadata={"help": "DistributedDataParallel backend"} + default="pytorch_ddp", metadata={"help": "DistributedDataParallel backend"} ) bucket_cap_mb: int = field( default=25, metadata={"help": "bucket size for reduction"} @@ -252,7 +251,7 @@ class DistributedTrainingConfig(FairseqDataclass): default=False, metadata={ "help": "disable unused parameter detection (not applicable to " - "no_c10d ddp-backend" + "--ddp-backend=legacy_ddp)" }, ) fast_stat_sync: bool = field( @@ -273,9 +272,6 @@ class DistributedTrainingConfig(FairseqDataclass): "batchnorm population statistics" }, ) - distributed_wrapper: DISTRIBUTED_WRAPPER_CHOICES = field( - default="DDP", metadata={"help": "DistributedDataParallel backend"} - ) slowmo_momentum: Optional[float] = field( default=None, metadata={ diff --git a/fairseq/dataclass/constants.py b/fairseq/dataclass/constants.py index 46881786a8..93bc6d03cb 100644 --- a/fairseq/dataclass/constants.py +++ b/fairseq/dataclass/constants.py @@ -35,9 +35,14 @@ def ChoiceEnum(choices: List[str]): LOG_FORMAT_CHOICES = ChoiceEnum(["json", "none", "simple", "tqdm"]) -DDP_BACKEND_CHOICES = ChoiceEnum(["c10d", "no_c10d"]) +DDP_BACKEND_CHOICES = ChoiceEnum([ + "c10d", # alias for pytorch_ddp + "legacy_ddp", + "no_c10d", # alias for legacy_ddp + "pytorch_ddp", + "slow_mo", +]) DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta"]) -DISTRIBUTED_WRAPPER_CHOICES = ChoiceEnum(["DDP", "SlowMo"]) GENERATION_CONSTRAINTS_CHOICES = ChoiceEnum(["ordered", "unordered"]) GENERATION_DECODING_FORMAT_CHOICES = ChoiceEnum( ["unigram", "ensemble", "vote", "dp", "bs"] diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index 37822362d4..3b5fe6e7a8 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -591,7 +591,7 @@ def all_gather_list(data, group=None, max_size=16384): "sync if one of them runs out of memory, or if there are other conditions " "in your training script that can cause one worker to finish an epoch " "while other workers are still iterating over their portions of the data. " - "Try rerunning with --ddp-backend=no_c10d and see if that helps." + "Try rerunning with --ddp-backend=legacy_ddp and see if that helps." 
) diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py index ffa3c37b19..b8fbc37793 100644 --- a/fairseq/models/distributed_fairseq_model.py +++ b/fairseq/models/distributed_fairseq_model.py @@ -49,7 +49,7 @@ def DistributedFairseqModel(args, model, process_group): module=model, process_group=process_group, ) - elif args.distributed_wrapper == "DDP" and args.ddp_backend == "c10d": + elif args.ddp_backend in {"c10d", "pytorch_ddp"}: ddp_class = nn.parallel.DistributedDataParallel init_kwargs = dict( module=model, @@ -62,14 +62,14 @@ def DistributedFairseqModel(args, model, process_group): # Maintain backward compatibility if "find_unused_parameters" in inspect.getargspec(ddp_class)[0]: init_kwargs["find_unused_parameters"] = args.find_unused_parameters - elif args.distributed_wrapper == "DDP" and args.ddp_backend == "no_c10d": + elif args.ddp_backend in {"no_c10d", "legacy_ddp"}: ddp_class = LegacyDistributedDataParallel init_kwargs = dict( module=model, buffer_size=2 ** 28, process_group=process_group, ) - elif args.distributed_wrapper == "SlowMo": + elif args.ddp_backend == "slow_mo": if _GOSSIP_DISABLED: raise ImportError( "Cannot find gossip library. Please install from: " diff --git a/fairseq/trainer.py b/fairseq/trainer.py index eea194b950..d893518fea 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -646,7 +646,7 @@ def maybe_no_sync(): if not self.tpu: if ( not self.cfg.optimization.use_bmuf - and self.cfg.distributed_training.distributed_wrapper != "SlowMo" + and self.cfg.distributed_training.ddp_backend != "slow_mo" ): self._check_grad_norms(grad_norm) if not torch.isfinite(grad_norm).all(): @@ -686,7 +686,8 @@ def maybe_no_sync(): logger.error("OOM during optimization, irrecoverable") raise e - # Some distributed wrappers (e.g., SlowMo) need access to the optimizer after the step + # Some distributed wrappers (e.g., SlowMo) need access to the optimizer + # after the step if hasattr(self.model, "perform_additional_optimizer_actions"): if hasattr(self.optimizer, "fp32_params"): self.model.perform_additional_optimizer_actions( @@ -700,7 +701,7 @@ def maybe_no_sync(): logging_output = None if ( not overflow - or self.cfg.distributed_training.distributed_wrapper == "SlowMo" + or self.cfg.distributed_training.ddp_backend == "slow_mo" ): self.set_num_updates(self.get_num_updates() + 1) @@ -1120,7 +1121,7 @@ def is_consistent(tensor): # use FloatingPointError to trigger NanDetector raise FloatingPointError( "Fatal error: gradients are inconsistent between workers. " - "Try --ddp-backend=no_c10d. " + "Try --ddp-backend=legacy_ddp. " "Or are you mixing up different generation of GPUs in training?" + "\n" + "-" * 80