目录
模型结构
backbone
模型的backbone是res50,主要由四个layers组成,在输入进入每个layer之前,需要经过nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),norm_layer(64),nn.ReLU(inplace=True)和nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
每个layers分别由3,4,6,3个block组成。
每个layer的output_channels分别为[256,512,1024,2048],第二、三、四个layer有下采样操作,在这三层的第一个block中会有用于下采样的卷积操作。
block是该主干网络的基本结构单元,每个block可的卷积操作可以由普通卷积或者变形卷积来执行。
block
class Bottleneck(nn.Module):
""" Adapted from torchvision.models.resnet """
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation)
self.bn1 = norm_layer(planes)
if use_dcn:
self.conv2 = DCN(planes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, deformable_groups=1)
self.conv2.bias.data.zero_()
self.conv2.conv_offset_mask.weight.data.zero_()
self.conv2.conv_offset_mask.bias.data.zero_()
else:
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=dilation, bias=False, dilation=dilation)
self.bn2 = norm_layer(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation)
self.bn3 = norm_layer(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNetBackbone(nn.Module):
""" Adapted from torchvision.models.resnet """
def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d):
super().__init__()
# These will be populated by _make_layer
self.num_base_layers = len(layers)
self.layers = nn.ModuleList()
self.channels = []
self.norm_layer = norm_layer
self.dilation = 1
self.atrous_layers = atrous_layers
# From torchvision.models.resnet.Resnet
self.inplanes = 64
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval)
self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval)
self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval)
self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval)
# This contains every module that should be initialized by loading in pretrained weights.
# Any extra layers added onto this that won't be initialized by init_backbone will not be
# in this list. That way, Yolact::init_weights knows which backbone weights to initialize
# with xavier, and which ones to leave alone.
self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]
def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1):
""" Here one layer means a string of n Bottleneck blocks. """
downsample = None
# This is actually just to create the connection between layers, and not necessarily to
# downsample. Even if the second condition is met, it only downsamples when stride != 1
if stride != 1 or self.inplanes != planes * block.expansion:
if len(self.layers) in self.atrous_layers:
self.dilation += 1
stride = 1
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False,
dilation=self.dilation),
self.norm_layer(planes * block.expansion),
)
layers = []
use_dcn = (dcn_layers >= blocks)
layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
use_dcn = ((i+dcn_layers) >= blocks) and (i % dcn_interval == 0)
layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn))
layer = nn.Sequential(*layers)
self.channels.append(planes * block.expansion)
self.layers.append(layer)
return layer
def forward(self, x):
""" Returns a list of convouts for each layer. """
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
outs = []
for layer in self.layers:
x = layer(x)
outs.append(x)
return tuple(outs)
FPN
fpn的输入是backbone的第二三四层,输出五层,输出的通道数统一为256。多输出的两层是通过卷积下采样获得的。输出的第一层为不仅送入head,还会经上采样后用于proto_mask的计算。
class FPN(ScriptModuleWrapper):
"""
Implements a general version of the FPN introduced in
https://arxiv.org/pdf/1612.03144.pdf
Parameters (in cfg.fpn):
- num_features (int): The number of output features in the fpn layers.
- interpolation_mode (str): The mode to pass to F.interpolate.
- num_downsample (int): The number of downsampled layers to add onto the selected layers.
These extra layers are downsampled from the last selected layer.
Args:
- in_channels (list): For each conv layer you supply in the forward pass,
how many features will it have?
"""
__constants__ = ['interpolation_mode', 'num_downsample', 'use_conv_downsample', 'relu_pred_layers',
'lat_layers', 'pred_layers', 'downsample_layers', 'relu_downsample_layers']
def __init__(self, in_channels):
super().__init__()
self.lat_layers = nn.ModuleList([
nn.Conv2d(x, cfg.fpn.num_features, kernel_size=1)
for x in reversed(in_channels)
])
# This is here for backwards compatability
padding = 1 if cfg.fpn.pad else 0 # 1
self.pred_layers = nn.ModuleList([
nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=padding)
for _ in in_channels
])
if cfg.fpn.use_conv_downsample: # true
self.downsample_layers = nn.ModuleList([
nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=1, stride=2)
for _ in range(cfg.fpn.num_downsample)
]) # num_downsample = 2
self.interpolation_mode = cfg.fpn.interpolation_mode # blinear
self.num_downsample = cfg.fpn.num_downsample # 2
self.use_conv_downsample = cfg.fpn.use_conv_downsample # true
self.relu_downsample_layers = cfg.fpn.relu_downsample_layers # false
self.relu_pred_layers = cfg.fpn.relu_pred_layers # true
@script_method_wrapper
def forward(self, convouts:List[torch.Tensor]):
"""
Args:
- convouts (list): A list of convouts for the corresponding layers in in_channels.
Returns:
- A list of FPN convouts in the same order as x with extra downsample layers if requested.
"""
out = []
x = torch.zeros(1, device=convouts[0].device)
for i in range(len(convouts)):
out.append(x)
# For backward compatability, the conv layers are stored in reverse but the input and output is
# given in the correct order. Thus, use j=-i-1 for the input and output and i for the conv layers.
j = len(convouts)
for lat_layer in self.lat_layers:
j -= 1
if j < len(convouts) - 1:
_, _, h, w = convouts[j].size()
x = F.interpolate(x, size=(h, w), mode=self.interpolation_mode, align_corners=False)
x = x + lat_layer(convouts[j])
out[j] = x
# This janky second loop is here because TorchScript.
j = len(convouts)
for pred_layer in self.pred_layers:
j -= 1
out[j] = pred_layer(out[j])
if self.relu_pred_layers:
F.relu(out[j], inplace=True)
cur_idx = len(out)
for downsample_layer in self.downsample_layers:
out.append(downsample_layer(out[-1]))
return out
head
输出层每个位置有三个achor,宽高比为[0.5,1,2]。一共有五个head,每个head的anchor的尺寸分别为[24,48,96,192,384]
head包括bbox_layer,conf_layer,mask_layer(maskdim=32)
self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, **cfg.head_layer_params)
self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, **cfg.head_layer_params)
self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.mask_dim, **cfg.head_layer_params)
protonet
将FPN的最后一层作为输入(input_size/8),将输出结果与head的mask_layer的输出结果进行线性叠加得到mask,
protonet里有个上采样操作,将输入变为原来的两倍,所以输出是的大小是input_size/4。
pythonSequential(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
(2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): ReLU(inplace=True)
(4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(5): ReLU(inplace=True)
(6): InterpolateModule()
(7): ReLU(inplace=True)
(8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(9): ReLU(inplace=True)
(10): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1))
)
最终的输出结果形势
loc: torch.Size([1, 19248, 4])
conf: torch.Size([1, 19248, 2])
mask: torch.Size([1, 19248, 32])
priors: torch.Size([19248, 4])
proto: torch.Size([1, 138, 138, 32])
segm: torch.Size([1, 1, 69, 69])
segm是语义分割用到的分支,输入为fpn的第一层输出。
train
cfg
‘use_focal_loss’: False,
‘focal_loss_alpha’: 0.25,
‘focal_loss_gamma’: 2,
‘focal_loss_init_pi’: 0.01,
‘positive_iou_threshold’: 0.5,
‘negative_iou_threshold’: 0.4,
‘ohem_negpos_ratio’: 3,
‘pred_aspect_ratios’: [1, 1/2, 2]
‘pred_scales’: [[24], [48], [96], [192], [384]],
‘crowd_iou_threshold’: 0.7
match
def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, conf_t, idx_t, idx, loc_data,
keypoints, keypoints_t):
"""Match each prior box with the ground truth box of the highest jaccard
overlap, encode the bounding boxes, then return the matched indices
corresponding to both confidence and location preds.
Args:
pos_thresh: (float) IoU > pos_thresh ==> positive.
neg_thresh: (float) IoU < neg_thresh ==> negative.
truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors].
priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
labels: (tensor) All the class labels for the image, Shape: [num_obj].
crowd_boxes: (tensor) All the crowd box annotations or None if there are none.
loc_t: (tensor) Tensor to be filled w/ endcoded location targets.
conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. Note: -1 means neutral.
idx_t: (tensor) Tensor to be filled w/ the index of the matched gt box for each prior.
idx: (int) current batch index.
loc_data: (tensor) The predicted bbox regression coordinates for this batch.
Return:
The matched indices corresponding to 1)location and 2)confidence preds.
"""
decoded_priors = decode(loc_data, priors, cfg.use_yolo_regressors) if cfg.use_prediction_matching else point_form(priors)
# Size [num_objects, num_priors]
overlaps = jaccard(truths, decoded_priors) if not cfg.use_change_matching else change(truths, decoded_priors)
# Size [num_priors] best ground truth for each prior
best_truth_overlap, best_truth_idx = overlaps.max(0)
# We want to ensure that each gt gets used at least once so that we don't
# waste any training data. In order to do that, find the max overlap anchor
# with each gt, and force that anchor to use that gt.
for _ in range(overlaps.size(0)):
# Find j, the gt with the highest overlap with a prior
# In effect, this will loop through overlaps.size(0) in a "smart" order,
# always choosing the highest overlap first.
best_prior_overlap, best_prior_idx = overlaps.max(1)
j = best_prior_overlap.max(0)[1] #选择那个与anchor有最大iou的gt
# Find i, the highest overlap anchor with this gt
i = best_prior_idx[j]
# Set all other overlaps with i to be -1 so that no other gt uses it
overlaps[:, i] = -1
# Set all other overlaps with j to be -1 so that this loop never uses j again
overlaps[j, :] = -1
# Overwrite i's score to be 2 so it doesn't get thresholded ever
best_truth_overlap[i] = 2
# Set the gt to be used for i to be j, overwriting whatever was there
best_truth_idx[i] = j
matches = truths[best_truth_idx] # Shape: [num_priors,4]
matches_kp = keypoints[best_truth_idx] # Shape: [num_priors,34]
conf = labels[best_truth_idx] + 1 # Shape: [num_priors]
conf[best_truth_overlap < pos_thresh] = -1 # label as neutral
conf[best_truth_overlap < neg_thresh] = 0 # label as background
# Deal with crowd annotations for COCO
if crowd_boxes is not None and cfg.crowd_iou_threshold < 1:
# Size [num_priors, num_crowds]
crowd_overlaps = jaccard(decoded_priors, crowd_boxes, iscrowd=True)
# Size [num_priors]
best_crowd_overlap, best_crowd_idx = crowd_overlaps.max(1)
# Set non-positives with crowd iou of over the threshold to be neutral.
conf[(conf <= 0) & (best_crowd_overlap > cfg.crowd_iou_threshold)] = -1
loc = encode(matches, priors, cfg.use_yolo_regressors)
kp = encode_kp(matches_kp, priors)
keypoints_t[idx] = kp
loc_t[idx] = loc # [num_priors,4] encoded offsets to learn
conf_t[idx] = conf # [num_priors] top class label for each prior
idx_t[idx] = best_truth_idx # [num_priors] indices for lookup
@torch.jit.script
def encode(matched, priors, use_yolo_regressors:bool=False):
variances = [0.1, 0.2]
g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
g_cxcy /= (variances[0] * priors[:, 2:])
g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
g_wh = torch.log(g_wh) / variances[1]
loc = torch.cat([g_cxcy, g_wh], 1)
return loc