Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing CustomDataParallel to DistributedDataParallel #487

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
setuptools>=41.0.0
pycocotools
scikit-build
numpy
opencv-python
tqdm
tensorboard
tensorboardX
pyyaml
webcolors
torch==1.4.0
torchvision==0.5.0
11 changes: 8 additions & 3 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
from efficientdet.dataset import CocoDataset, Resizer, Normalizer, Augmenter, collater
from efficientdet.loss import FocalLoss
from utils.sync_batchnorm import patch_replication_callback
from utils.utils import replace_w_sync_bn, CustomDataParallel, get_last_weights, init_weights, boolean_string
from utils.utils import replace_w_sync_bn, get_last_weights, init_weights, boolean_string
import torch.distributed as dist


class Params:
Expand Down Expand Up @@ -181,9 +182,13 @@ def freeze_backbone(m):
if params.num_gpus > 0:
model = model.cuda()
if params.num_gpus > 1:
model = CustomDataParallel(model, params.num_gpus)
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6666', world_size=1, rank=0)
model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
if use_sync_bn:
patch_replication_callback(model)

torch.backends.cudnn.benchmark = True

if opt.optim == 'adamw':
optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
Expand Down Expand Up @@ -315,7 +320,7 @@ def freeze_backbone(m):


def save_checkpoint(model, name):
if isinstance(model, CustomDataParallel):
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
torch.save(model.module.model.state_dict(), os.path.join(opt.saved_path, name))
Copy link
Owner

@zylo117 zylo117 Aug 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure whether DDP's `.module` attribute is the real underlying model, the way it is with DP.

Copy link
Owner

@zylo117 zylo117 Aug 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, it's model.model.state_dict() for DDP, just like for the ordinary model. Or are both OK?

else:
torch.save(model.model.state_dict(), os.path.join(opt.saved_path, name))
Expand Down
25 changes: 0 additions & 25 deletions utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,31 +181,6 @@ def replace_w_sync_bn(m):
for var_name, children in m.named_children():
replace_w_sync_bn(children)


class CustomDataParallel(nn.DataParallel):
    """
    DataParallel variant that splits each batch directly onto the GPUs.

    Forces the data to be split across all GPUs instead of sending everything
    to cuda:0 and then moving it around: ``scatter`` slices the first two
    positional inputs along dim 0 and copies slice ``i`` straight to
    ``cuda:i``.
    """

    def __init__(self, module, num_gpus):
        """
        Args:
            module: the module to wrap; forwarded unchanged to ``nn.DataParallel``.
            num_gpus: number of equal slices each batch is cut into; slice ``i``
                is copied to device ``cuda:i``.
        """
        super().__init__(module)
        self.num_gpus = num_gpus

    def scatter(self, inputs, kwargs, device_ids):
        """
        Split ``inputs[0]`` and ``inputs[1]`` into ``num_gpus`` equal slices.

        More like scatter and data prep at the same time: the data is prepared
        so that no further scatter is needed and nothing has to be shuffled
        between GPUs afterwards.

        Args:
            inputs: positional inputs; only ``inputs[0]`` and ``inputs[1]`` are
                used, and both are sliced with the same index ranges along dim 0.
            kwargs: keyword arguments; the same object is handed to every device.
            device_ids: ignored; the targets are always ``cuda:0`` ..
                ``cuda:{num_gpus - 1}``.

        Returns:
            A tuple ``(per_device_inputs, per_device_kwargs)``, each a list of
            length ``num_gpus``.

        Raises:
            ValueError: if the batch holds fewer samples than ``num_gpus``.
        """
        batch_size = inputs[0].shape[0]
        per_gpu = batch_size // self.num_gpus

        if per_gpu == 0:
            # A batch exactly equal to num_gpus is fine (one sample per GPU),
            # so the requirement is "at least", not "greater than".
            raise ValueError(
                f'Batch size ({batch_size}) must be at least num_gpus ({self.num_gpus}).')

        # NOTE: when batch_size is not a multiple of num_gpus, the trailing
        # batch_size % num_gpus samples fall outside every slice and are not
        # sent to any device.
        scattered_inputs = []
        for device_idx in range(self.num_gpus):
            chunk = slice(per_gpu * device_idx, per_gpu * (device_idx + 1))
            target = f'cuda:{device_idx}'
            scattered_inputs.append(
                (inputs[0][chunk].to(target, non_blocking=True),
                 inputs[1][chunk].to(target, non_blocking=True)))

        return scattered_inputs, [kwargs] * self.num_gpus


def get_last_weights(weights_path):
weights_path = glob(weights_path + f'/*.pth')
weights_path = sorted(weights_path,
Expand Down