From 5e343f5f23b4a90cca2beec416b87d4dd7a4264f Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Thu, 28 Jan 2021 14:18:48 -0800
Subject: [PATCH] Remove --distributed-wrapper (consolidate to --ddp-backend)
 (#1544)

Summary:
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1544

Remove the separate --distributed-wrapper option and fold its choices into
--ddp-backend, which now accepts:

* pytorch_ddp (default; c10d is kept as an alias)
* legacy_ddp (no_c10d is kept as an alias)
* slow_mo (replaces --distributed-wrapper=SlowMo)

DistributedFairseqModel and the Trainer now branch on --ddp-backend alone,
and the docs, example commands and wav2vec configs are updated to the new
names.

Test Plan: Imported from OSS

Reviewed By: girifb

Differential Revision: D25836856

Pulled By: myleott

fbshipit-source-id: eb0a6a02f4d9fe2b6b12a456ef95208dd92c97cb
---
 examples/cross_lingual_language_model/README.md | 2 +-
 examples/language_model/README.adaptive_inputs.md | 2 +-
 examples/language_model/README.conv.md | 2 +-
 examples/latent_depth/README.md | 2 +-
 examples/mbart/README.md | 2 +-
 examples/nonautoregressive_translation/README.md | 2 +-
 examples/nonautoregressive_translation/scripts.md | 12 ++++++------
 examples/pay_less_attention_paper/README.md | 6 +++---
 examples/quant_noise/README.md | 10 +++++-----
 examples/roberta/README.race.md | 2 +-
 examples/roberta/commonsense_qa/README.md | 2 +-
 examples/roberta/wsc/README.md | 4 ++--
 examples/translation/README.md | 2 +-
 examples/translation_moe/README.md | 2 +-
 examples/wav2vec/config/finetuning/base_100h.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_10h.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_10m.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_1h.yaml | 2 +-
 examples/wav2vec/config/finetuning/base_960h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_100h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_10h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_10m.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_1h.yaml | 2 +-
 examples/wav2vec/config/finetuning/vox_960h.yaml | 2 +-
 .../pretraining/wav2vec2_base_librispeech.yaml | 2 +-
 .../config/pretraining/wav2vec2_large_librivox.yaml | 2 +-
 fairseq/criterions/adaptive_loss.py | 6 +++---
 fairseq/dataclass/configs.py | 8 ++------
 fairseq/dataclass/constants.py | 9 +++++++--
 fairseq/distributed_utils.py | 2 +-
 fairseq/models/distributed_fairseq_model.py | 6 +++---
 fairseq/trainer.py | 9 +++++----
 32 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/examples/cross_lingual_language_model/README.md b/examples/cross_lingual_language_model/README.md
index f4c76cfed5..af9128e39e 100644
--- a/examples/cross_lingual_language_model/README.md
+++ b/examples/cross_lingual_language_model/README.md
@@ -68,7 +68,7 @@ fairseq-train \
 --dataset-impl lazy --seed 0 \
 --masked-lm-only \
 --monolingual-langs 'ar,de,en,hi,fr' --num-segment 5 \
---ddp-backend=no_c10d
+--ddp-backend=legacy_ddp
 ```
 
 Some Notes:
diff --git a/examples/language_model/README.adaptive_inputs.md b/examples/language_model/README.adaptive_inputs.md
index 98043c5377..6650d58f37 100644
--- a/examples/language_model/README.adaptive_inputs.md
+++ b/examples/language_model/README.adaptive_inputs.md
@@ -22,7 +22,7 @@ fairseq-train --task language_modeling \
     --max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
     --warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
     --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
-    --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
+    --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
 ```
 
 ## Citation
diff --git a/examples/language_model/README.conv.md b/examples/language_model/README.conv.md
index f0b6a3a921..1ff8635906 100644
---
a/examples/language_model/README.conv.md +++ b/examples/language_model/README.conv.md @@ -17,7 +17,7 @@ fairseq-train --task language_modeling \ --optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \ --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ --max-tokens 1024 --tokens-per-sample 1024 \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --max-epoch 35 ``` diff --git a/examples/latent_depth/README.md b/examples/latent_depth/README.md index e70e16405c..7774c33305 100644 --- a/examples/latent_depth/README.md +++ b/examples/latent_depth/README.md @@ -30,7 +30,7 @@ fairseq-train ${databin_dir} \ --lr 0.0015 \ --clip-norm 1.0 \ --seed 2 \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --encoder-layers 12 \ --decoder-layers 24 \ --decoder-latent-layer \ diff --git a/examples/mbart/README.md b/examples/mbart/README.md index 8a3e22d425..a45e37243c 100644 --- a/examples/mbart/README.md +++ b/examples/mbart/README.md @@ -81,7 +81,7 @@ fairseq-train path_2_data \ --restore-file $PRETRAIN \ --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ --langs $langs \ - --ddp-backend no_c10d + --ddp-backend legacy_ddp ``` ## Generate on EN-RO Get sacrebleu on finetuned en-ro model diff --git a/examples/nonautoregressive_translation/README.md b/examples/nonautoregressive_translation/README.md index 7b2d42a91d..8793e225c9 100644 --- a/examples/nonautoregressive_translation/README.md +++ b/examples/nonautoregressive_translation/README.md @@ -36,7 +36,7 @@ The following command will train a *Levenshtein Transformer* on the binarized da fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ diff --git a/examples/nonautoregressive_translation/scripts.md b/examples/nonautoregressive_translation/scripts.md index a3a33e6e02..9d3d7b67dc 100644 --- a/examples/nonautoregressive_translation/scripts.md +++ b/examples/nonautoregressive_translation/scripts.md @@ -6,7 +6,7 @@ Note that we need to have an additional module to perform "length prediction" (` fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nonautoregressive_transformer \ @@ -35,7 +35,7 @@ Note that we implemented a low-rank appromixated CRF model by setting `--crf-low fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nacrf_transformer \ @@ -68,7 +68,7 @@ Note that `--train-step` means how many iterations of refinement we used during fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch iterative_nonautoregressive_transformer \ @@ -101,7 +101,7 @@ Note that we need to specify the "slot-loss" (uniform or balanced tree) describe fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch insertion_transformer \ @@ -128,7 +128,7 @@ fairseq-train \ fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch cmlm_transformer \ @@ -157,7 
+157,7 @@ fairseq-train \ fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ diff --git a/examples/pay_less_attention_paper/README.md b/examples/pay_less_attention_paper/README.md index d5b19af6cc..5adab11f4d 100644 --- a/examples/pay_less_attention_paper/README.md +++ b/examples/pay_less_attention_paper/README.md @@ -113,7 +113,7 @@ CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \ --log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler inverse_sqrt \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \ --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \ -a lightconv_iwslt_de_en --save-dir $SAVE \ @@ -138,7 +138,7 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ - --ddp-backend=no_c10d --max-tokens 3584 \ + --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 20000 \ @@ -163,7 +163,7 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ - --ddp-backend=no_c10d --max-tokens 3584 \ + --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 70000 \ diff --git a/examples/quant_noise/README.md b/examples/quant_noise/README.md index 7fe301f732..539c3d5af9 100644 --- a/examples/quant_noise/README.md +++ b/examples/quant_noise/README.md @@ -154,7 +154,7 @@ fairseq-train $DATA_DIR \ --batch-size $MAX_SENTENCES \ --update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \ --save-dir checkpoint/roberta \ - --ddp-backend no_c10d --encoder-layerdrop 0.2 \ + --ddp-backend legacy_ddp --encoder-layerdrop 0.2 \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta ``` @@ -189,7 +189,7 @@ fairseq-train /path/to/rte/data/ \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 ``` @@ -205,7 +205,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \ --arch transformer_lm_gbw \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --clip-norm 0.1 --criterion adaptive_loss \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \ --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \ @@ -252,7 +252,7 @@ fairseq-train --task sentence_prediction 
/path/to/data/ \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 --lr-scheduler polynomial_decay \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ - --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \ + --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \ --quantization-config-path /path/to/config/yaml ``` @@ -266,7 +266,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \ --clip-norm 0.1 --criterion adaptive_loss \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --fp16 --keep-last-epochs -1 \ --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \ diff --git a/examples/roberta/README.race.md b/examples/roberta/README.race.md index 527a0bce14..13c917e8ec 100644 --- a/examples/roberta/README.race.md +++ b/examples/roberta/README.race.md @@ -19,7 +19,7 @@ UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. DATA_DIR=/path/to/race-output-dir ROBERTA_PATH=/path/to/roberta/model.pt -CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=no_c10d \ +CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ diff --git a/examples/roberta/commonsense_qa/README.md b/examples/roberta/commonsense_qa/README.md index 4f371f8b30..05c6f841a8 100644 --- a/examples/roberta/commonsense_qa/README.md +++ b/examples/roberta/commonsense_qa/README.md @@ -39,7 +39,7 @@ DATA_DIR=data/CommonsenseQA FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa -CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \ +CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \ $DATA_DIR \ --user-dir $FAIRSEQ_USER_DIR \ --restore-file $ROBERTA_PATH \ diff --git a/examples/roberta/wsc/README.md b/examples/roberta/wsc/README.md index d40da6a5fd..21a045d999 100644 --- a/examples/roberta/wsc/README.md +++ b/examples/roberta/wsc/README.md @@ -51,7 +51,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ - --fp16 --ddp-backend no_c10d \ + --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task wsc --criterion wsc --wsc-cross-entropy \ --arch roberta_large --bpe gpt2 --max-positions 512 \ @@ -110,7 +110,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ - --fp16 --ddp-backend no_c10d \ + --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task winogrande --criterion winogrande \ --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \ diff --git a/examples/translation/README.md b/examples/translation/README.md index 7b1fcc8de2..2941f5eb84 100644 --- a/examples/translation/README.md +++ 
b/examples/translation/README.md @@ -263,7 +263,7 @@ fairseq-preprocess --source-lang fr --target-lang en \ mkdir -p checkpoints/multilingual_transformer CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \ --max-epoch 50 \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task multilingual_translation --lang-pairs de-en,fr-en \ --arch multilingual_transformer_iwslt_de_en \ --share-decoders --share-decoder-input-output-embed \ diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md index 3cc3fb46dc..2e5c8af617 100644 --- a/examples/translation_moe/README.md +++ b/examples/translation_moe/README.md @@ -15,7 +15,7 @@ The model is trained with online responsibility assignment and shared parameteri The following command will train a `hMoElp` model with `3` experts: ```bash -fairseq-train --ddp-backend='no_c10d' \ +fairseq-train --ddp-backend='legacy_ddp' \ data-bin/wmt17_en_de \ --max-update 100000 \ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ diff --git a/examples/wav2vec/config/finetuning/base_100h.yaml b/examples/wav2vec/config/finetuning/base_100h.yaml index 7d1664a184..539dabb047 100644 --- a/examples/wav2vec/config/finetuning/base_100h.yaml +++ b/examples/wav2vec/config/finetuning/base_100h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_10h.yaml b/examples/wav2vec/config/finetuning/base_10h.yaml index 31125947c0..16a3c4d96c 100644 --- a/examples/wav2vec/config/finetuning/base_10h.yaml +++ b/examples/wav2vec/config/finetuning/base_10h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_10m.yaml b/examples/wav2vec/config/finetuning/base_10m.yaml index 2235504489..3ceb77a252 100644 --- a/examples/wav2vec/config/finetuning/base_10m.yaml +++ b/examples/wav2vec/config/finetuning/base_10m.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_1h.yaml b/examples/wav2vec/config/finetuning/base_1h.yaml index 2235504489..3ceb77a252 100644 --- a/examples/wav2vec/config/finetuning/base_1h.yaml +++ b/examples/wav2vec/config/finetuning/base_1h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 2 criterion: diff --git a/examples/wav2vec/config/finetuning/base_960h.yaml b/examples/wav2vec/config/finetuning/base_960h.yaml index d742c94abf..e393805ad8 100644 --- a/examples/wav2vec/config/finetuning/base_960h.yaml +++ b/examples/wav2vec/config/finetuning/base_960h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 8 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_100h.yaml b/examples/wav2vec/config/finetuning/vox_100h.yaml index 8885c78470..2fdb0c568c 100644 --- a/examples/wav2vec/config/finetuning/vox_100h.yaml +++ b/examples/wav2vec/config/finetuning/vox_100h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 
criterion: diff --git a/examples/wav2vec/config/finetuning/vox_10h.yaml b/examples/wav2vec/config/finetuning/vox_10h.yaml index c0957c0058..f1a979e05d 100644 --- a/examples/wav2vec/config/finetuning/vox_10h.yaml +++ b/examples/wav2vec/config/finetuning/vox_10h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_10m.yaml b/examples/wav2vec/config/finetuning/vox_10m.yaml index 0d567552d7..d12439bb28 100644 --- a/examples/wav2vec/config/finetuning/vox_10m.yaml +++ b/examples/wav2vec/config/finetuning/vox_10m.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_1h.yaml b/examples/wav2vec/config/finetuning/vox_1h.yaml index 10c45a52d8..7f3b04c034 100644 --- a/examples/wav2vec/config/finetuning/vox_1h.yaml +++ b/examples/wav2vec/config/finetuning/vox_1h.yaml @@ -27,7 +27,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 4 criterion: diff --git a/examples/wav2vec/config/finetuning/vox_960h.yaml b/examples/wav2vec/config/finetuning/vox_960h.yaml index 6212a2e738..0633915bb2 100644 --- a/examples/wav2vec/config/finetuning/vox_960h.yaml +++ b/examples/wav2vec/config/finetuning/vox_960h.yaml @@ -22,7 +22,7 @@ dataset: valid_subset: dev_other distributed_training: - ddp_backend: no_c10d + ddp_backend: legacy_ddp distributed_world_size: 24 criterion: diff --git a/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml b/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml index e2c2b7b0b3..767aee2852 100644 --- a/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml +++ b/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml @@ -23,7 +23,7 @@ dataset: distributed_training: distributed_world_size: 64 - ddp_backend: no_c10d + ddp_backend: legacy_ddp criterion: _name: wav2vec diff --git a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml index 0c911b7491..bee41157a9 100644 --- a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml +++ b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml @@ -24,7 +24,7 @@ dataset: distributed_training: distributed_world_size: 128 - ddp_backend: no_c10d + ddp_backend: legacy_ddp criterion: _name: wav2vec diff --git a/fairseq/criterions/adaptive_loss.py b/fairseq/criterions/adaptive_loss.py index 15ad9a15bf..6209ceaedb 100644 --- a/fairseq/criterions/adaptive_loss.py +++ b/fairseq/criterions/adaptive_loss.py @@ -32,11 +32,11 @@ def __init__(self, task, sentence_avg): @classmethod def build_criterion(cls, cfg: AdaptiveLossConfig, task): - if cfg.ddp_backend == "c10d": + if cfg.ddp_backend in {"c10d", "pytorch_ddp"}: raise Exception( - "AdaptiveLoss is not compatible with the c10d " + "AdaptiveLoss is not compatible with the PyTorch " "version of DistributedDataParallel. Please use " - "`--ddp-backend=no_c10d` instead." + "`--ddp-backend=legacy_ddp` instead." 
) return cls(task, cfg.sentence_avg) diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py index 2ed27284dc..f66e98fe83 100644 --- a/fairseq/dataclass/configs.py +++ b/fairseq/dataclass/configs.py @@ -12,7 +12,6 @@ from fairseq.dataclass.constants import ( DATASET_IMPL_CHOICES, DDP_BACKEND_CHOICES, - DISTRIBUTED_WRAPPER_CHOICES, GENERATION_CONSTRAINTS_CHOICES, GENERATION_DECODING_FORMAT_CHOICES, LOG_FORMAT_CHOICES, @@ -236,7 +235,7 @@ class DistributedTrainingConfig(FairseqDataclass): }, ) ddp_backend: DDP_BACKEND_CHOICES = field( - default="c10d", metadata={"help": "DistributedDataParallel backend"} + default="pytorch_ddp", metadata={"help": "DistributedDataParallel backend"} ) bucket_cap_mb: int = field( default=25, metadata={"help": "bucket size for reduction"} @@ -252,7 +251,7 @@ class DistributedTrainingConfig(FairseqDataclass): default=False, metadata={ "help": "disable unused parameter detection (not applicable to " - "no_c10d ddp-backend" + "--ddp-backend=legacy_ddp)" }, ) fast_stat_sync: bool = field( @@ -273,9 +272,6 @@ class DistributedTrainingConfig(FairseqDataclass): "batchnorm population statistics" }, ) - distributed_wrapper: DISTRIBUTED_WRAPPER_CHOICES = field( - default="DDP", metadata={"help": "DistributedDataParallel backend"} - ) slowmo_momentum: Optional[float] = field( default=None, metadata={ diff --git a/fairseq/dataclass/constants.py b/fairseq/dataclass/constants.py index 46881786a8..93bc6d03cb 100644 --- a/fairseq/dataclass/constants.py +++ b/fairseq/dataclass/constants.py @@ -35,9 +35,14 @@ def ChoiceEnum(choices: List[str]): LOG_FORMAT_CHOICES = ChoiceEnum(["json", "none", "simple", "tqdm"]) -DDP_BACKEND_CHOICES = ChoiceEnum(["c10d", "no_c10d"]) +DDP_BACKEND_CHOICES = ChoiceEnum([ + "c10d", # alias for pytorch_ddp + "legacy_ddp", + "no_c10d", # alias for legacy_ddp + "pytorch_ddp", + "slow_mo", +]) DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta"]) -DISTRIBUTED_WRAPPER_CHOICES = ChoiceEnum(["DDP", "SlowMo"]) GENERATION_CONSTRAINTS_CHOICES = ChoiceEnum(["ordered", "unordered"]) GENERATION_DECODING_FORMAT_CHOICES = ChoiceEnum( ["unigram", "ensemble", "vote", "dp", "bs"] diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py index 37822362d4..3b5fe6e7a8 100644 --- a/fairseq/distributed_utils.py +++ b/fairseq/distributed_utils.py @@ -591,7 +591,7 @@ def all_gather_list(data, group=None, max_size=16384): "sync if one of them runs out of memory, or if there are other conditions " "in your training script that can cause one worker to finish an epoch " "while other workers are still iterating over their portions of the data. " - "Try rerunning with --ddp-backend=no_c10d and see if that helps." + "Try rerunning with --ddp-backend=legacy_ddp and see if that helps." 
) diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py index ffa3c37b19..b8fbc37793 100644 --- a/fairseq/models/distributed_fairseq_model.py +++ b/fairseq/models/distributed_fairseq_model.py @@ -49,7 +49,7 @@ def DistributedFairseqModel(args, model, process_group): module=model, process_group=process_group, ) - elif args.distributed_wrapper == "DDP" and args.ddp_backend == "c10d": + elif args.ddp_backend in {"c10d", "pytorch_ddp"}: ddp_class = nn.parallel.DistributedDataParallel init_kwargs = dict( module=model, @@ -62,14 +62,14 @@ def DistributedFairseqModel(args, model, process_group): # Maintain backward compatibility if "find_unused_parameters" in inspect.getargspec(ddp_class)[0]: init_kwargs["find_unused_parameters"] = args.find_unused_parameters - elif args.distributed_wrapper == "DDP" and args.ddp_backend == "no_c10d": + elif args.ddp_backend in {"no_c10d", "legacy_ddp"}: ddp_class = LegacyDistributedDataParallel init_kwargs = dict( module=model, buffer_size=2 ** 28, process_group=process_group, ) - elif args.distributed_wrapper == "SlowMo": + elif args.ddp_backend == "slow_mo": if _GOSSIP_DISABLED: raise ImportError( "Cannot find gossip library. Please install from: " diff --git a/fairseq/trainer.py b/fairseq/trainer.py index eea194b950..d893518fea 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -646,7 +646,7 @@ def maybe_no_sync(): if not self.tpu: if ( not self.cfg.optimization.use_bmuf - and self.cfg.distributed_training.distributed_wrapper != "SlowMo" + and self.cfg.distributed_training.ddp_backend != "slow_mo" ): self._check_grad_norms(grad_norm) if not torch.isfinite(grad_norm).all(): @@ -686,7 +686,8 @@ def maybe_no_sync(): logger.error("OOM during optimization, irrecoverable") raise e - # Some distributed wrappers (e.g., SlowMo) need access to the optimizer after the step + # Some distributed wrappers (e.g., SlowMo) need access to the optimizer + # after the step if hasattr(self.model, "perform_additional_optimizer_actions"): if hasattr(self.optimizer, "fp32_params"): self.model.perform_additional_optimizer_actions( @@ -700,7 +701,7 @@ def maybe_no_sync(): logging_output = None if ( not overflow - or self.cfg.distributed_training.distributed_wrapper == "SlowMo" + or self.cfg.distributed_training.ddp_backend == "slow_mo" ): self.set_num_updates(self.get_num_updates() + 1) @@ -1120,7 +1121,7 @@ def is_consistent(tensor): # use FloatingPointError to trigger NanDetector raise FloatingPointError( "Fatal error: gradients are inconsistent between workers. " - "Try --ddp-backend=no_c10d. " + "Try --ddp-backend=legacy_ddp. " "Or are you mixing up different generation of GPUs in training?" + "\n" + "-" * 80