Code works well in PyTorch 1.6.0; add a training script that uses Automatic Mixed Precision

AlexHex7 · AlexHex7 · commit f9002d5874fc · 2020-08-05T19:06:34.000+08:00
diff --git a/README.md b/README.md
@@ -11,9 +11,9 @@
 - If there is something wrong in my code, please contact me, thanks!
 
 ## Environment
-- python 3.7.3
-- pytorch 1.2.0
-- opencv 3.4.2
+- python 3.7.7
+- pytorch 1.4.0 (>=1.2.0, 1.6.0 works too)
+- opencv 4.2.0.34 (others work too)
 
 ## Visualization
 1. In the **first** Non-local Layer.
@@ -32,17 +32,25 @@
     from lib.non_local_gaussian import NONLocalBlock2D
     from lib.non_local_embedded_gaussian import NONLocalBlock2D
     from lib.non_local_dot_product import NONLocalBlock2D
+   ```
+   
 2. Run **demo_MNIST_train.py** with one or multiple GPUs to train the network. The weights will then be saved in **weights/**.
     ```
-    CUDA_VISIBLE_DEVICES=0,1 python demo_MNIST.py
-
+    CUDA_VISIBLE_DEVICES=0,1 python demo_MNIST_train.py
+    
+    # Or train with Automatic Mixed Precision based on pytorch 1.6.0
+    CUDA_VISIBLE_DEVICES=0 python demo_MNIST_AMP_train_with_single_gpu.py
+    ```
+   
 3. Run **nl_map_save.py** to save NL_MAP of one test sample in **nl_map_vis**.
     ```
     CUDA_VISIBLE_DEVICES=0,1 python nl_map_save.py
-    
+    ```
+   
 4. Go into **nl_map_vis/** and run **nl_map_vis.py** to visualize the NL_MAP. (Tip: if the Non-local type you select is **non_local_concatenation** or **non_local_dot_product** (without the Softmax operation), you may need to normalize the NL_MAP in the visualization code.)
     ```
     python nl_map_vis.py
+   ```
 
 ## Update Records
 1. Figure out how to implement the **concatenation** type, and add the code to **lib/**.
@@ -66,6 +74,9 @@ to **Non-Local_pytorch_0.3.1/**.
 
 8. In order to visualize NL_MAP, some code has been slightly modified. The script **nl_map_save.py** is added to save the NL_MAP (of the two Non-local Layers) for one test sample. The script **Non-local_pytorch/nl_map_vis.py** is added to visualize the NL_MAP. Besides, the code now supports pytorch 1.2.0.
 
+9. The code also works well in **pytorch 1.4.0**.
+
+10. The code also works well in **pytorch 1.6.0**. Added **demo_MNIST_AMP_train_with_single_gpu.py**, which trains with Automatic Mixed Precision (FP16), supported since **pytorch 1.6.0**. It can reduce GPU memory usage during training. What's more, if you use a GPU with tensor cores (e.g. 2080Ti), training speed can also increase. More details (such as how to train with multiple GPUs) can be found [here](https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training).
 
 ## Todo
 - Experiments on Charades dataset.
diff --git a/demo_MNIST_AMP_train_with_single_gpu.py b/demo_MNIST_AMP_train_with_single_gpu.py
@@ -0,0 +1,96 @@
+import torch  # MNIST training demo using Automatic Mixed Precision (torch.cuda.amp) on a single GPU
+import torch.utils.data as Data
+import torchvision
+from lib.network import Network
+from torch import nn
+from torch.cuda import amp
+import time
+
+# MNIST train/test splits; ToTensor() converts each image to a tensor.
+train_data = torchvision.datasets.MNIST(root='./mnist', train=True,
+                                        transform=torchvision.transforms.ToTensor(),
+                                        download=True)  # only the train split requests a download
+test_data = torchvision.datasets.MNIST(root='./mnist/',
+                                       transform=torchvision.transforms.ToTensor(),
+                                       train=False)  # no download=True here; presumably relies on the files fetched above
+# Large batches (128 * 50 = 6400 samples each); only the training loader shuffles.
+train_loader = Data.DataLoader(dataset=train_data, batch_size=128 * 50, shuffle=True)
+test_loader = Data.DataLoader(dataset=test_data, batch_size=128 * 50, shuffle=False)
+# Number of batches per epoch; not referenced again in this script.
+train_batch_num = len(train_loader)
+test_batch_num = len(test_loader)
+
+net = Network()
+if torch.cuda.is_available():
+    # net = nn.DataParallel(net)
+    net.cuda()  # single-GPU run; the multi-GPU wrapper above is intentionally disabled
+# Lines fenced by the "+++" banners are the AMP-specific additions to the training script.
+# +++++++++++++++++++++++++++++++
+scaler = amp.GradScaler()  # loss scaler: multiplies the loss so small fp16 gradients do not underflow
+# +++++++++++++++++++++++++++++++
+
+opt = torch.optim.Adam(net.parameters(), lr=0.001)
+loss_func = nn.CrossEntropyLoss()
+
+for epoch_index in range(10):  # fixed 10 epochs; each epoch is one training pass followed by one test pass
+    st = time.time()
+
+    torch.set_grad_enabled(True)  # re-enable autograd (it is switched off globally for evaluation below)
+    net.train()
+    for train_batch_index, (img_batch, label_batch) in enumerate(train_loader):
+        if torch.cuda.is_available():
+            img_batch = img_batch.cuda()
+            label_batch = label_batch.cuda()
+        # The non-AMP forward pass is kept commented out for comparison; AMP runs forward + loss under autocast.
+        # ++++++++++++++++++++++++++++++++++++++++++++++
+        # predict = net(img_batch)
+        # loss = loss_func(predict, label_batch)
+        with amp.autocast():
+            predict = net(img_batch)
+            loss = loss_func(predict, label_batch)
+        # ++++++++++++++++++++++++++++++++++++++++++++++
+
+        net.zero_grad()  # clear gradients left from the previous iteration before backward
+        # ++++++++++++++++++++++++++++++++++++++++++++++
+        # loss.backward()
+        # opt.step()
+        scaler.scale(loss).backward()  # backward pass on the scaled loss
+        scaler.step(opt)  # unscales gradients, then calls opt.step() unless they contain inf/NaN
+        scaler.update()  # adjusts the scale factor for the next iteration
+        # ++++++++++++++++++++++++++++++++++++++++++++++
+
+    print('(LR:%f) Time of a epoch:%.4fs' % (opt.param_groups[0]['lr'], time.time()-st))  # timing covers the training loop only
+    # Evaluation on the test set: autograd disabled, eval mode, and no autocast (forward is outside amp.autocast()).
+    torch.set_grad_enabled(False)
+    net.eval()
+    total_loss = []  # per-batch loss tensors
+    total_acc = 0  # running count of correct predictions
+    total_sample = 0  # running count of evaluated samples
+
+    for test_batch_index, (img_batch, label_batch) in enumerate(test_loader):
+        if torch.cuda.is_available():
+            img_batch = img_batch.cuda()
+            label_batch = label_batch.cuda()
+
+        predict = net(img_batch)
+        loss = loss_func(predict, label_batch)
+
+        predict = predict.argmax(dim=1)  # predicted class index per sample
+        acc = (predict == label_batch).sum()  # number of correct predictions in this batch (a tensor)
+
+        total_loss.append(loss)
+        total_acc += acc
+        total_sample += img_batch.size(0)
+
+    net.train()  # restore training mode for the next epoch
+
+    mean_acc = total_acc.item() * 1.0 / total_sample  # fraction of correctly classified test samples
+    mean_loss = sum(total_loss) / total_loss.__len__()  # unweighted mean of the per-batch losses
+
+    print('[Test] epoch[%d/%d] acc:%.4f%% loss:%.4f\n'
+          % (epoch_index, 10, mean_acc * 100, mean_loss.item()))  # epoch_index is 0-based here
+# Optional weight saving, left disabled in this demo.
+# weight_path = 'weights/net.pth'
+# print('Save Net weights to', weight_path)
+# net.cpu()
+# torch.save(net.state_dict(), weight_path)
diff --git a/lib/non_local_embedded_gaussian.py b/lib/non_local_embedded_gaussian.py
@@ -130,12 +130,12 @@ def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=T
         print(out.size())
 
         img = torch.zeros(2, 3, 20, 20)
-        net = NONLocalBlock2D(3, sub_sample=sub_sample_, bn_layer=bn_layer_, store_last_batch_nl_map=True)
+        net = NONLocalBlock2D(3, sub_sample=sub_sample_, bn_layer=bn_layer_)
         out = net(img)
         print(out.size())
 
         img = torch.randn(2, 3, 8, 20, 20)
-        net = NONLocalBlock3D(3, sub_sample=sub_sample_, bn_layer=bn_layer_, store_last_batch_nl_map=True)
+        net = NONLocalBlock3D(3, sub_sample=sub_sample_, bn_layer=bn_layer_)
         out = net(img)
         print(out.size())
 
diff --git a/lib/non_local_gaussian.py b/lib/non_local_gaussian.py
@@ -78,8 +78,8 @@ def forward(self, x, return_nl_map=False):
         f = torch.matmul(theta_x, phi_x)
         f_div_C = F.softmax(f, dim=-1)
 
-        if self.store_last_batch_nl_map:
-            self.nl_map = f_div_C
+        # if self.store_last_batch_nl_map:
+        #     self.nl_map = f_div_C
 
         y = torch.matmul(f_div_C, g_x)
         y = y.permute(0, 2, 1).contiguous()