@@ -92,7 +92,7 @@ def __init__(self, dim, dropout = 0., mult = 4.):
             nn.Linear(dim * mult, dim)
         )

-    def forward(self, x):
+    def forward(self, x, cache = None, cache_key = None):
         return self.net(x)

 # token shift classes
@@ -104,7 +104,13 @@ def __init__(self, fn, image_size, seq_len):
         self.image_size = image_size
         self.seq_len = seq_len

-    def forward(self, x, **kwargs):
+    def forward(self, x, cache = None, cache_key = None, **kwargs):
+        n0 = x.shape[1]
+        if exists(cache):
+            if cache_key in cache:
+                x = torch.cat([cache[cache_key], x], dim = -2)
+            cache[cache_key] = x
+
         n = x.shape[1]
         seq_len, image_size = self.seq_len, self.image_size
         img_seq_len = image_size ** 2
@@ -134,7 +140,7 @@ def forward(self, x, **kwargs):

         x_img = rearrange(x_img, 'b h w d -> b (h w) d')
         x = torch.cat((x_text, x_img[:, :-padding]), dim = 1)
-        return self.fn(x, **kwargs)
+        return self.fn(x[:, -n0:], cache = cache, **kwargs)

 # main transformer class

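The caching added to PreShiftToken.forward above concatenates the cached prefix in front of the newly supplied tokens, recomputes the token shift over the full sequence, and then hands only the new suffix (x[:, -n0:]) to the wrapped layer. Below is a minimal, self-contained sketch of just that caching pattern, assuming a shared cache dict keyed by cache_key; CachedPrefix and the toy fn are illustrative names, not code from this repository.

    import torch
    from torch import nn

    # Simplified stand-in that mirrors only the caching pattern introduced above:
    # prepend the cached prefix, remember the full sequence, then call the wrapped
    # fn on the new tokens only. The real PreShiftToken additionally performs its
    # axial token shift on the full sequence before the final call.
    class CachedPrefix(nn.Module):
        def __init__(self, fn):
            super().__init__()
            self.fn = fn

        def forward(self, x, cache = None, cache_key = None, **kwargs):
            n0 = x.shape[1]                          # number of new tokens in this call
            if cache is not None:
                if cache_key in cache:
                    x = torch.cat([cache[cache_key], x], dim = -2)
                cache[cache_key] = x                 # full history for the next step
            # (the axial token shift over the full x would happen here)
            return self.fn(x[:, -n0:], cache = cache, **kwargs)

    cache = {}
    layer = CachedPrefix(lambda t, cache = None: t)  # toy fn standing in for attention
    tokens = torch.randn(1, 4, 8)
    a = layer(tokens[:, :3], cache = cache, cache_key = 'demo')  # prefill: 3 in, 3 out
    b = layer(tokens[:, 3:], cache = cache, cache_key = 'demo')  # decode: 1 in, 1 out
    assert cache['demo'].shape[1] == 4                           # full sequence cached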
@@ -221,7 +227,8 @@ def __init__(
             attn = CachedAs(f'attn_{ind}', attn)

             if shift_tokens:
-                attn, ff = map(lambda t: PreShiftToken(t, image_size = image_fmap_size, seq_len = seq_len), (attn, ff))
+                attn = CachedAs(f'preshift_attn_{ind}', PreShiftToken(attn, image_size = image_fmap_size, seq_len = seq_len))
+                ff = CachedAs(f'preshift_ff_{ind}', PreShiftToken(ff, image_size = image_fmap_size, seq_len = seq_len))

             layers.append(nn.ModuleList([
                 LayerScale(dim, ind + 1, PreNorm(dim, attn, sandwich = sandwich_norm)),
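CachedAs itself is not shown in this diff. Judging purely from the call sites above (CachedAs(f'preshift_attn_{ind}', PreShiftToken(...))), it presumably binds a fixed cache_key to the wrapped module so that each layer reads and writes its own slot in the shared cache. A rough sketch under that assumption:

    from torch import nn

    # Assumption: the real CachedAs is defined elsewhere in transformer.py; this
    # sketch only reflects how it is invoked in the hunk above.
    class CachedAs(nn.Module):
        def __init__(self, cache_key, fn):
            super().__init__()
            self.cache_key = cache_key
            self.fn = fn

        def forward(self, x, *, cache = None, **kwargs):
            return self.fn(x, cache = cache, cache_key = self.cache_key, **kwargs)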
@@ -230,8 +237,9 @@ def __init__(

         execute_type = ReversibleSequence if reversible else SequentialSequence
         route_attn = ((True, False),) * depth
+        route_all = ((True, True),) * depth
         attn_route_map = {'mask': route_attn, 'rotary_pos_emb': route_attn,
-                          'cache': route_attn}
+                          'cache': route_all}

         self.layers = execute_type(layers, args_route = attn_route_map)

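route_attn = ((True, False),) * depth routes a keyword argument only to the attention half of each (attention, feed-forward) pair, so under the old mapping the cache never reached the feed-forward branch; now that ff is also wrapped in a cached PreShiftToken, 'cache' is routed with route_all = ((True, True),) * depth. A simplified sketch of this routing idea (not the actual SequentialSequence implementation):

    depth = 2
    route_attn = ((True, False),) * depth   # e.g. 'mask' is only needed by attention
    route_all  = ((True, True),) * depth    # 'cache' must now also reach feed-forward
    args_route = {'mask': route_attn, 'cache': route_all}

    def kwargs_for(layer_index, branch, **kwargs):
        # branch 0 = attention, branch 1 = feed-forward; unrouted kwargs are dropped
        return {k: v for k, v in kwargs.items()
                if k in args_route and args_route[k][layer_index][branch]}

    # The feed-forward branch of layer 0 now receives the cache but not the mask:
    print(kwargs_for(0, 1, mask = 'causal', cache = {}))   # -> {'cache': {}}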
@@ -270,9 +278,9 @@ def forward(self, x, **kwargs):

     def _get_static_mask(self, attn_type):
         img_seq_len = self.image_fmap_size ** 2
-        text_len = self.seq_len - img_seq_len
+        text_len = self.seq_len + 1 - img_seq_len

-        static_mask = torch.ones(self.seq_len, self.seq_len, dtype = torch.bool)
+        static_mask = torch.zeros(self.seq_len, self.seq_len, dtype = torch.bool)
         static_mask[:, :text_len] = True
         if attn_type == 'axial_row':
             for row in range(self.image_fmap_size):
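In this last hunk the static mask now starts from all False rather than all True (on an all-True mask the visible True assignments were no-ops, so nothing was ever restricted), and text_len gains a +1 so one extra leading position is counted as text. A tiny worked example with made-up sizes, showing only the part visible in this hunk:

    import torch

    # Made-up sizes for illustration: a 2x2 image feature map and seq_len = 7.
    image_fmap_size = 2
    img_seq_len = image_fmap_size ** 2        # 4 image positions
    seq_len = 7
    text_len = seq_len + 1 - img_seq_len      # 4 leading positions treated as text

    static_mask = torch.zeros(seq_len, seq_len, dtype = torch.bool)  # start all False
    static_mask[:, :text_len] = True          # every position may attend to the text
    # The row loop continuing beyond this hunk then marks which image positions each
    # axial row (or column) is allowed to attend to.
    print(static_mask.int())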