From f2710bbe1dc19717b3c99e910cb453ea20ae8b08 Mon Sep 17 00:00:00 2001 From: lokoppakmsft <112720551+lokoppakmsft@users.noreply.github.com> Date: Fri, 11 Nov 2022 14:04:31 -0800 Subject: [PATCH] Make data contiguous before the inplace reshape-copy_ function (#2489) Co-authored-by: Michael Wyatt --- deepspeed/module_inject/load_checkpoint.py | 1 + deepspeed/module_inject/replace_module.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index 560f1bc83bc7..f577a1a0e1bc 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -15,6 +15,7 @@ def load_model_with_checkpoint(r_module, error_msgs = [] def transpose(data): + data = data.contiguous() data1 = data.transpose(-1, -2).reshape(-1) data.reshape(-1).copy_(data1) data1 = None diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index bfb4791713bb..a06f7a5a722e 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -215,6 +215,7 @@ def replace_attn(child, policy): attn_module = transformer_inference.DeepSpeedDiffusersAttention(config) def transpose(data): + data = data.contiguous() data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) data = data.reshape(data.shape[-1], data.shape[-2]) data.to(torch.cuda.current_device()) @@ -531,7 +532,7 @@ def replace_with_policy(child, # transpose it here to reduce inference cost! def transpose(data): # temp move to cpu to avoid requiring extra GPU memory during the reshape - data = data.to('cpu') + data = data.to('cpu').contiguous() data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) data = data.reshape(data.shape[-1], data.shape[-2]) data.to(torch.cuda.current_device())