Commit 5e343f5

Remove --distributed-wrapper (consolidate to --ddp-backend) (#1544)
Summary: Pull Request resolved: fairinternal/fairseq-py#1544

Test Plan: Imported from OSS

Reviewed By: girifb

Differential Revision: D25836856

Pulled By: myleott

fbshipit-source-id: eb0a6a02f4d9fe2b6b12a456ef95208dd92c97cb
Myle Ott authored and facebook-github-bot committed Jan 28, 2021
1 parent 96da4d3 commit 5e343f5
Showing 32 changed files with 59 additions and 57 deletions.
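The consolidation amounts to a rename of the backend values plus removal of the separate wrapper option. A minimal before/after sketch (hypothetical command lines, not taken from the diff below; `data-bin/my_data` and the omitted flags are placeholders):

```bash
# Old spelling (pre-commit): opt out of torch's DistributedDataParallel wrapper.
fairseq-train data-bin/my_data --ddp-backend=no_c10d

# New spelling: same behavior under the consolidated flag. The former default
# `--ddp-backend=c10d` becomes `--ddp-backend=pytorch_ddp`, and the separate
# `--distributed-wrapper` option is removed.
fairseq-train data-bin/my_data --ddp-backend=legacy_ddp
```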
2 changes: 1 addition & 1 deletion examples/cross_lingual_language_model/README.md
@@ -68,7 +68,7 @@ fairseq-train \
--dataset-impl lazy --seed 0 \
--masked-lm-only \
--monolingual-langs 'ar,de,en,hi,fr' --num-segment 5 \
- --ddp-backend=no_c10d
+ --ddp-backend=legacy_ddp
```

Some Notes:
2 changes: 1 addition & 1 deletion examples/language_model/README.adaptive_inputs.md
@@ -22,7 +22,7 @@ fairseq-train --task language_modeling \
--max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
--warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
--criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
- --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
+ --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
```

## Citation
2 changes: 1 addition & 1 deletion examples/language_model/README.conv.md
@@ -17,7 +17,7 @@ fairseq-train --task language_modeling \
--optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--max-tokens 1024 --tokens-per-sample 1024 \
- --ddp-backend no_c10d \
+ --ddp-backend legacy_ddp \
--max-epoch 35
```

2 changes: 1 addition & 1 deletion examples/latent_depth/README.md
@@ -30,7 +30,7 @@ fairseq-train ${databin_dir} \
--lr 0.0015 \
--clip-norm 1.0 \
--seed 2 \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--encoder-layers 12 \
--decoder-layers 24 \
--decoder-latent-layer \
2 changes: 1 addition & 1 deletion examples/mbart/README.md
@@ -81,7 +81,7 @@ fairseq-train path_2_data \
--restore-file $PRETRAIN \
--reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
--langs $langs \
- --ddp-backend no_c10d
+ --ddp-backend legacy_ddp
```
## Generate on EN-RO
Get sacrebleu on finetuned en-ro model
2 changes: 1 addition & 1 deletion examples/nonautoregressive_translation/README.md
@@ -36,7 +36,7 @@ The following command will train a *Levenshtein Transformer* on the binarized da
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch levenshtein_transformer \
12 changes: 6 additions & 6 deletions examples/nonautoregressive_translation/scripts.md
@@ -6,7 +6,7 @@ Note that we need to have an additional module to perform "length prediction" (`
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch nonautoregressive_transformer \
@@ -35,7 +35,7 @@ Note that we implemented a low-rank appromixated CRF model by setting `--crf-low
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch nacrf_transformer \
@@ -68,7 +68,7 @@ Note that `--train-step` means how many iterations of refinement we used during
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch iterative_nonautoregressive_transformer \
@@ -101,7 +101,7 @@ Note that we need to specify the "slot-loss" (uniform or balanced tree) describe
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch insertion_transformer \
@@ -128,7 +128,7 @@ fairseq-train \
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch cmlm_transformer \
@@ -157,7 +157,7 @@ fairseq-train \
fairseq-train \
data-bin/wmt14_en_de_distill \
--save-dir checkpoints \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task translation_lev \
--criterion nat_loss \
--arch levenshtein_transformer \
6 changes: 3 additions & 3 deletions examples/pay_less_attention_paper/README.md
@@ -113,7 +113,7 @@ CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \
--log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler inverse_sqrt \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
--adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
-a lightconv_iwslt_de_en --save-dir $SAVE \
@@ -138,7 +138,7 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
--adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
- --ddp-backend=no_c10d --max-tokens 3584 \
+ --ddp-backend=legacy_ddp --max-tokens 3584 \
--lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
--lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \
--t-mult 1 --lr-period-updates 20000 \
@@ -163,7 +163,7 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
--adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
- --ddp-backend=no_c10d --max-tokens 3584 \
+ --ddp-backend=legacy_ddp --max-tokens 3584 \
--lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
--lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \
--t-mult 1 --lr-period-updates 70000 \
10 changes: 5 additions & 5 deletions examples/quant_noise/README.md
@@ -154,7 +154,7 @@ fairseq-train $DATA_DIR \
--batch-size $MAX_SENTENCES \
--update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \
--save-dir checkpoint/roberta \
- --ddp-backend no_c10d --encoder-layerdrop 0.2 \
+ --ddp-backend legacy_ddp --encoder-layerdrop 0.2 \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta
```

@@ -189,7 +189,7 @@ fairseq-train /path/to/rte/data/ \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
- --ddp-backend no_c10d \
+ --ddp-backend legacy_ddp \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8
```

@@ -205,7 +205,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--clip-norm 0.1 --criterion adaptive_loss \
- --ddp-backend no_c10d \
+ --ddp-backend legacy_ddp \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \
--decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \
@@ -252,7 +252,7 @@ fairseq-train --task sentence_prediction /path/to/data/ \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 --lr-scheduler polynomial_decay \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
- --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
+ --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \
--quantization-config-path /path/to/config/yaml
```

@@ -266,7 +266,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \
--clip-norm 0.1 --criterion adaptive_loss \
- --ddp-backend no_c10d \
+ --ddp-backend legacy_ddp \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--fp16 --keep-last-epochs -1 \
--min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \
2 changes: 1 addition & 1 deletion examples/roberta/README.race.md
@@ -19,7 +19,7 @@ UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs.
DATA_DIR=/path/to/race-output-dir
ROBERTA_PATH=/path/to/roberta/model.pt

- CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=no_c10d \
+ CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
2 changes: 1 addition & 1 deletion examples/roberta/commonsense_qa/README.md
@@ -39,7 +39,7 @@ DATA_DIR=data/CommonsenseQA
FAIRSEQ_PATH=/path/to/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa

- CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \
+ CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \
$DATA_DIR \
--user-dir $FAIRSEQ_USER_DIR \
--restore-file $ROBERTA_PATH \
4 changes: 2 additions & 2 deletions examples/roberta/wsc/README.md
@@ -51,7 +51,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--valid-subset val \
- --fp16 --ddp-backend no_c10d \
+ --fp16 --ddp-backend legacy_ddp \
--user-dir $FAIRSEQ_USER_DIR \
--task wsc --criterion wsc --wsc-cross-entropy \
--arch roberta_large --bpe gpt2 --max-positions 512 \
@@ -110,7 +110,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--valid-subset val \
- --fp16 --ddp-backend no_c10d \
+ --fp16 --ddp-backend legacy_ddp \
--user-dir $FAIRSEQ_USER_DIR \
--task winogrande --criterion winogrande \
--wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \
2 changes: 1 addition & 1 deletion examples/translation/README.md
@@ -263,7 +263,7 @@ fairseq-preprocess --source-lang fr --target-lang en \
mkdir -p checkpoints/multilingual_transformer
CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \
--max-epoch 50 \
- --ddp-backend=no_c10d \
+ --ddp-backend=legacy_ddp \
--task multilingual_translation --lang-pairs de-en,fr-en \
--arch multilingual_transformer_iwslt_de_en \
--share-decoders --share-decoder-input-output-embed \
2 changes: 1 addition & 1 deletion examples/translation_moe/README.md
@@ -15,7 +15,7 @@ The model is trained with online responsibility assignment and shared parameteri

The following command will train a `hMoElp` model with `3` experts:
```bash
- fairseq-train --ddp-backend='no_c10d' \
+ fairseq-train --ddp-backend='legacy_ddp' \
data-bin/wmt17_en_de \
--max-update 100000 \
--task translation_moe --user-dir examples/translation_moe/translation_moe_src \
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/base_100h.yaml
@@ -22,7 +22,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 2

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/base_10h.yaml
@@ -27,7 +27,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 2

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/base_10m.yaml
@@ -27,7 +27,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 2

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/base_1h.yaml
@@ -27,7 +27,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 2

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/base_960h.yaml
@@ -22,7 +22,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 8

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/vox_100h.yaml
@@ -22,7 +22,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 4

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/vox_10h.yaml
@@ -27,7 +27,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 4

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/vox_10m.yaml
@@ -27,7 +27,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 4

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/vox_1h.yaml
@@ -27,7 +27,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 4

criterion:
2 changes: 1 addition & 1 deletion examples/wav2vec/config/finetuning/vox_960h.yaml
@@ -22,7 +22,7 @@ dataset:
valid_subset: dev_other

distributed_training:
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp
distributed_world_size: 24

criterion:
@@ -23,7 +23,7 @@ dataset:

distributed_training:
distributed_world_size: 64
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp

criterion:
_name: wav2vec
@@ -24,7 +24,7 @@ dataset:

distributed_training:
distributed_world_size: 128
- ddp_backend: no_c10d
+ ddp_backend: legacy_ddp

criterion:
_name: wav2vec
6 changes: 3 additions & 3 deletions fairseq/criterions/adaptive_loss.py
@@ -32,11 +32,11 @@ def __init__(self, task, sentence_avg):

     @classmethod
     def build_criterion(cls, cfg: AdaptiveLossConfig, task):
-        if cfg.ddp_backend == "c10d":
+        if cfg.ddp_backend in {"c10d", "pytorch_ddp"}:
             raise Exception(
-                "AdaptiveLoss is not compatible with the c10d "
+                "AdaptiveLoss is not compatible with the PyTorch "
                 "version of DistributedDataParallel. Please use "
-                "`--ddp-backend=no_c10d` instead."
+                "`--ddp-backend=legacy_ddp` instead."
             )
         return cls(task, cfg.sentence_avg)

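For context, the adaptive-loss recipes updated elsewhere in this commit pair the criterion with the legacy backend. A hedged sketch of a compliant invocation (hypothetical data path; other flags omitted):

```bash
# adaptive_loss is rejected under the default pytorch_ddp backend,
# so the legacy wrapper is requested explicitly.
fairseq-train data-bin/my_lm_data \
    --task language_modeling --criterion adaptive_loss \
    --ddp-backend=legacy_ddp
```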
8 changes: 2 additions & 6 deletions fairseq/dataclass/configs.py
@@ -12,7 +12,6 @@
 from fairseq.dataclass.constants import (
     DATASET_IMPL_CHOICES,
     DDP_BACKEND_CHOICES,
-    DISTRIBUTED_WRAPPER_CHOICES,
     GENERATION_CONSTRAINTS_CHOICES,
     GENERATION_DECODING_FORMAT_CHOICES,
     LOG_FORMAT_CHOICES,
@@ -236,7 +235,7 @@ class DistributedTrainingConfig(FairseqDataclass):
         },
     )
     ddp_backend: DDP_BACKEND_CHOICES = field(
-        default="c10d", metadata={"help": "DistributedDataParallel backend"}
+        default="pytorch_ddp", metadata={"help": "DistributedDataParallel backend"}
     )
     bucket_cap_mb: int = field(
         default=25, metadata={"help": "bucket size for reduction"}
@@ -252,7 +251,7 @@ class DistributedTrainingConfig(FairseqDataclass):
         default=False,
         metadata={
             "help": "disable unused parameter detection (not applicable to "
-            "no_c10d ddp-backend"
+            "--ddp-backend=legacy_ddp)"
         },
     )
     fast_stat_sync: bool = field(
@@ -273,9 +272,6 @@ class DistributedTrainingConfig(FairseqDataclass):
"batchnorm population statistics"
},
)
distributed_wrapper: DISTRIBUTED_WRAPPER_CHOICES = field(
default="DDP", metadata={"help": "DistributedDataParallel backend"}
)
slowmo_momentum: Optional[float] = field(
default=None,
metadata={