-
Notifications
You must be signed in to change notification settings - Fork 0
/
Models_N1.py
106 lines (87 loc) · 4.22 KB
/
Models_N1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import torch
import torchvision
import pdb
import torch.nn.functional as F
class ResNet101_depth_with_multi(torch.nn.Module):
def __init__(self, num_classes, out_channels=1, out_channels_normal=3):
super(ResNet101_depth_with_multi, self).__init__()
resnet = torchvision.models.resnet101(pretrained=True)
base_model_out_size = 2048
self.layer0 = torch.nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool)
self.layer1 = torch.nn.Sequential(resnet.layer1) # 256
self.layer2 = torch.nn.Sequential(resnet.layer2) # 512
self.layer3 = torch.nn.Sequential(resnet.layer3) # 1024
self.layer4 = torch.nn.Sequential(resnet.layer4) # 2048
# Top layer
self.toplayer = torch.nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0) # Reduce channels
# Lateral layers decoding in lateral layer
self.latlayer1 = torch.nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
self.latlayer2 = torch.nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)
self.latlayer3 = torch.nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
# Smooth layers -> gets bigger depth features by increasing kernel sizes.
self.smooth1 = torch.nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
self.smooth2 = torch.nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
self.smooth3 = torch.nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
# Depth prediction
self.predict1 = torch.nn.Sequential(
torch.nn.Conv2d(256, 64, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
)
self.predict2 = torch.nn.Sequential(
torch.nn.Conv2d(64, out_channels, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
)
#Normal Prediction
self.predict_normal1 = torch.nn.Sequential(
torch.nn.Conv2d(256, 64, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
)
self.predict_normal2 = torch.nn.Sequential(
torch.nn.Conv2d(64, out_channels_normal, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
)
self.classifier = torch.nn.Linear(2048, num_classes)
def _upsample_add(self, x, y):
'''Upsample and add two feature maps.
Args:
x: (Variable) top feature map to be upsampled.
(Variable) added feature map.
Note in PyTorch, when input size is odd, the upsampled feature map
with `F.upsample(..., scale_factor=2, mode='nearest')`
maybe not equal to the lateral feature map size.
e.g.
original input size: [N,_,15,15] ->
e map size: [N,_,8,8] ->
upsampled feature map size: [N,_,16,16]
So we choose bilinear upsample which supports arbitrary output sizes.
'''
#output size is equal to the lateral feature map.
_, _, H, W = y.size()
return F.interpolate(x, size=(H, W), mode='bilinear', align_corners = True) + y
def forward(self, x):
_, _, H, W = x.size() # batchsize N,channel,height,width
# Bottom-up
c1 = self.layer0(x)
c2 = self.layer1(c1) # 256 channels, 1/4 size
c3 = self.layer2(c2) # 512 channels, 1/8 size
c4 = self.layer3(c3) # 1024 channels, 1/16 size
c5 = self.layer4(c4) # 2048 channels, 1/32 size
#flat_multi_preds,recons_preds
#Why is it refer to as a lateral eature map?
# Top-down
p5 = self.toplayer(c5) #input 2048 channels
p4 = self._upsample_add(p5, self.latlayer1(c4)) # 256 channels, 1/16 size (1028 -> 64)
p4 = self.smooth1(p4) #what is smoothing layer for?
p3 = self._upsample_add(p4, self.latlayer2(c3)) # 256 channels, 1/8 size (512 -> 64)
p3 = self.smooth2(p3) # 256 channels, 1/8 size
p2 = self._upsample_add(p3, self.latlayer3(c2)) # 256, 1/4 size (256 -> 64)
p2 = self.smooth3(p2) # 256 channels, 1/4 size
#Second Encoder Layer
#why is two prediction layer needed? -> why cant i go from 256 to output channels)
normal = self.predict_normal2(self.predict_normal1(p2))
#might have to separate each layers
#depth = self.predict2(self.predict1(p2)) #prediction output depth map : size of 64 channels.
#Making classification predictions based on the depth feature map produced from p2.
features = F.avg_pool2d(c5,7).view(-1,2048) #extracting feature map.
output = self.classifier(features)
return normal, output