Skip to content
This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 20da1c0

Browse files
committed
performance boost
1 parent 48f21f6 commit 20da1c0

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

float8_experimental/float8_ops.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,17 +223,19 @@ def backward(ctx, go_fp8: torch.Tensor):
223223
x_fp8, w_fp8 = ctx.saved_tensors
224224

225225
# calculate dL/dX
226-
go_fp8_reshaped = go_fp8.reshape(-1, go_fp8.size(-1))
226+
go_fp8_reshaped = go_fp8.view(-1, go_fp8.size(-1))
227227
w_fp8_t_c_t = w_fp8.t().contiguous().t()
228228
dL_dX = float8_mm_helper(go_fp8_reshaped, w_fp8_t_c_t)
229-
dL_dX = dL_dX.reshape(*go_fp8.shape[:-1], dL_dX.size(-1))
229+
dL_dX = dL_dX.view(*go_fp8.shape[:-1], dL_dX.size(-1))
230230

231231
# calculate dL/dW
232-
x_fp8_reshaped_t_c = x_fp8.reshape(-1, x_fp8.size(-1)).t().contiguous()
232+
x_fp8_reshaped_t_c = x_fp8.view(-1, x_fp8.size(-1)).t().contiguous()
233233
go_fp8_reshaped_t_c_t = go_fp8_reshaped.t().contiguous().t()
234234

235235
dL_dW = float8_mm_helper(x_fp8_reshaped_t_c, go_fp8_reshaped_t_c_t)
236-
dL_dW = dL_dW.t()
236+
# The contiguous call is not needed for correctness, but allows for a faster backward
237+
# pass in conjunction with compile for both single-gpu and fsdp.
238+
dL_dW = dL_dW.t().contiguous()
237239

238240
empty_grads = None, None, None, None, None, None, None, None, None
239241
return dL_dX, dL_dW, *empty_grads

0 commit comments

Comments (0)