From 03cc549ba8bfca228a804b75bb28b142e9703168 Mon Sep 17 00:00:00 2001
From: Adriano Santos
Date: Mon, 24 Aug 2020 16:41:10 -0400
Subject: [PATCH] Changing CustomDataParallel to DistributedDataParallel

---
 requirements.txt | 12 ++++++++++++
 train.py         | 11 ++++++++---
 utils/utils.py   | 25 -------------------------
 3 files changed, 20 insertions(+), 28 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..ca387f146
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+setuptools>=41.0.0
+pycocotools
+scikit-build
+numpy
+opencv-python
+tqdm
+tensorboard
+tensorboardX
+pyyaml
+webcolors
+torch==1.4.0
+torchvision==0.5.0
\ No newline at end of file
diff --git a/train.py b/train.py
index 04133cb0c..961ea00d4 100644
--- a/train.py
+++ b/train.py
@@ -20,7 +20,8 @@
 from efficientdet.dataset import CocoDataset, Resizer, Normalizer, Augmenter, collater
 from efficientdet.loss import FocalLoss
 from utils.sync_batchnorm import patch_replication_callback
-from utils.utils import replace_w_sync_bn, CustomDataParallel, get_last_weights, init_weights, boolean_string
+from utils.utils import replace_w_sync_bn, get_last_weights, init_weights, boolean_string
+import torch.distributed as dist
 
 
 class Params:
@@ -181,9 +182,13 @@ def freeze_backbone(m):
     if params.num_gpus > 0:
         model = model.cuda()
         if params.num_gpus > 1:
-            model = CustomDataParallel(model, params.num_gpus)
+            # https://pytorch.org/tutorials/intermediate/dist_tuto.html
+            dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6666', world_size=1, rank=0)
+            model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
             if use_sync_bn:
                 patch_replication_callback(model)
+
+    torch.backends.cudnn.benchmark = True
 
     if opt.optim == 'adamw':
         optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
@@ -315,7 +320,7 @@ def freeze_backbone(m):
 
 
 def save_checkpoint(model, name):
-    if isinstance(model, CustomDataParallel):
+    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
         torch.save(model.module.model.state_dict(), os.path.join(opt.saved_path, name))
     else:
         torch.save(model.model.state_dict(), os.path.join(opt.saved_path, name))
diff --git a/utils/utils.py b/utils/utils.py
index 0b69340a6..24f25f239 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -181,31 +181,6 @@ def replace_w_sync_bn(m):
     for var_name, children in m.named_children():
         replace_w_sync_bn(children)
 
-
-class CustomDataParallel(nn.DataParallel):
-    """
-    force splitting data to all gpus instead of sending all data to cuda:0 and then moving around.
-    """
-
-    def __init__(self, module, num_gpus):
-        super().__init__(module)
-        self.num_gpus = num_gpus
-
-    def scatter(self, inputs, kwargs, device_ids):
-        # More like scatter and data prep at the same time. The point is we prep the data in such a way
-        # that no scatter is necessary, and there's no need to shuffle stuff around different GPUs.
-        devices = ['cuda:' + str(x) for x in range(self.num_gpus)]
-        splits = inputs[0].shape[0] // self.num_gpus
-
-        if splits == 0:
-            raise Exception('Batchsize must be greater than num_gpus.')
-
-        return [(inputs[0][splits * device_idx: splits * (device_idx + 1)].to(f'cuda:{device_idx}', non_blocking=True),
-                 inputs[1][splits * device_idx: splits * (device_idx + 1)].to(f'cuda:{device_idx}', non_blocking=True))
-                for device_idx in range(len(devices))], \
-               [kwargs] * len(devices)
-
-
 def get_last_weights(weights_path):
     weights_path = glob(weights_path + f'/*.pth')
     weights_path = sorted(weights_path,
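Note on multi-process use (a minimal sketch, not part of the patch above): the new code calls dist.init_process_group with world_size=1 and rank=0, i.e. a single process drives all GPUs. DistributedDataParallel is more commonly run with one process per GPU; under that launch model (e.g. python -m torch.distributed.launch --use_env --nproc_per_node=N train.py), initialization would typically read the rank and world size from the environment, roughly as below. The helper name setup_distributed and the environment-variable defaults are illustrative assumptions, not code from this repository.

import os

import torch
import torch.distributed as dist


def setup_distributed():
    # Assumed launch model (hypothetical helper): one process per GPU, with
    # RANK, WORLD_SIZE and LOCAL_RANK set by the launcher
    # (torch.distributed.launch --use_env, which also sets MASTER_ADDR/MASTER_PORT).
    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    torch.cuda.set_device(local_rank)  # pin this process to its own GPU
    dist.init_process_group(backend='nccl',        # NCCL backend for GPU training
                            init_method='env://',  # read rendezvous info from env vars
                            world_size=world_size,
                            rank=rank)
    return local_rank

# Usage, mirroring the shape of the patched train.py:
# local_rank = setup_distributed()
# model = model.cuda(local_rank)
# model = torch.nn.parallel.DistributedDataParallel(
#     model, device_ids=[local_rank], find_unused_parameters=True)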