BlendMask: Top-Down Meets Bottom-Up for Instance Segmentation
If this post helps you, I'd appreciate a like~
Table of Contents
- BlendMask network structure
- Code for the top_feat structure added in BlendMask's FCOS branch
- 1. AdelaiDet/adet/modeling/blendmask/blendmask.py
- 2. AdelaiDet/adet/modeling/blendmask/blender.py
- 3. AdelaiDet/adet/modeling/blendmask/basis_module.py
BlendMask extends the anchor-free FCOS object detector into the instance segmentation domain.
The overall execution order is: backbone (ResNet + FPN) --> FCOS --> blendmask.py --> basis_module.py --> blender.py; a simplified sketch of this flow follows below.
For my notes on the FCOS part of the code, see my other post, FCOS code notes.
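To make that call order concrete, here is a minimal, runnable sketch of the data flow. This is not the actual AdelaiDet code: every function body, tensor shape, and the number of detections below is a dummy placeholder, only the order of the calls mirrors the pipeline described above.

import torch

def backbone(images):
    # ResNet + FPN: multi-scale feature maps P3-P7 (dummy shapes)
    return {f"p{i}": torch.randn(2, 256, 96 // 2 ** (i - 3), 148 // 2 ** (i - 3)) for i in range(3, 8)}

def fcos_with_top_layer(features):
    # top-down branch: detections plus a K*M*M = 4*14*14 = 784-dim attention vector per instance
    boxes = torch.randn(10, 4)
    top_feats = torch.randn(10, 784)
    return boxes, top_feats

def basis_module(features):
    # bottom-up branch: K = 4 basis masks covering the whole image (dummy resolution)
    return torch.randn(2, 4, 200, 296)

def blender(bases, top_feats, boxes):
    # crop the bases inside each box and weight them with the attention -> per-instance masks
    return torch.randn(10, 1, 56, 56)

images = torch.randn(2, 3, 800, 1184)
feats = backbone(images)
boxes, top_feats = fcos_with_top_layer(feats)
bases = basis_module(feats)
masks = blender(bases, top_feats, boxes)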
BlendMask network structure:
Code for the top_feat structure added in BlendMask's FCOS branch
def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances, top_feats=None):
"""
Return the losses from a set of FCOS predictions and their associated ground-truth.
Returns:
dict[loss name -> loss value]: A dict mapping from loss name to loss value.
"""
'''
the rest of this method is unchanged and omitted here
'''
if len(top_feats) > 0: # blendmask
instances.top_feats = cat([
# Reshape: (N, C=784, Hi, Wi) -> (N*Hi*Wi, 784)
x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
], dim=0,)
'''
in blendmask:
top_feats[0].size()
torch.Size([2, 784, 96, 148])
top_feats[1].size()
torch.Size([2, 784, 48, 74])
top_feats[2].size()
torch.Size([2, 784, 24, 37])
top_feats[3].size()
torch.Size([2, 784, 12, 19])
top_feats[4].size()
torch.Size([2, 784, 6, 10])
'''
# instances.top_feats.size() is [37872, 784]; the subsequent fcos_losses(self, instances) call
# filters further, keeping only a tensor of shape [num_pos_instances, 784].
# This is the attention expressed as a matrix:
# each row holds 784 features (784 channels), and 37872 = batch_size * sum(Hi * Wi)
# over the five FPN levels, i.e. 2 * (96*148 + 48*74 + 24*37 + 12*19 + 6*10).
# In short, the 2-D spatial grid h*w is flattened into a single dimension of length hw.
pdb.set_trace()
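To sanity-check this reshaping outside the model, the following standalone snippet (not part of AdelaiDet) reproduces the flattening with the per-level sizes printed in the docstring above and arrives at the same [37872, 784] shape:

import torch

# per-level top_feats with the shapes listed above: (N=2, C=784, Hi, Wi)
sizes = [(96, 148), (48, 74), (24, 37), (12, 19), (6, 10)]
top_feats = [torch.randn(2, 784, h, w) for h, w in sizes]

flattened = torch.cat([
    # (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
    x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
], dim=0)

print(flattened.shape)  # torch.Size([37872, 784]); 37872 = 2 * (96*148 + 48*74 + 24*37 + 12*19 + 6*10)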
1. AdelaiDet/adet/modeling/blendmask/blendmask.py
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from detectron2.structures import ImageList
from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.modeling.backbone import build_backbone
from detectron2.modeling.meta_arch.panoptic_fpn import combine_semantic_and_instance_outputs
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.meta_arch.semantic_seg import build_sem_seg_head
from .blender import build_blender
from .basis_module import build_basis_module
import pdb
__all__ = ["BlendMask"]
@META_ARCH_REGISTRY.register()
class BlendMask(nn.Module):
"""
Main class for BlendMask architectures (see https://arxiv.org/abs/1901.02446).
"""
def __init__(self, cfg):
super().__init__()
self.device = torch.device(cfg.MODEL.DEVICE)
self.instance_loss_weight = cfg.MODEL.BLENDMASK.INSTANCE_LOSS_WEIGHT # 1.0
self.backbone = build_backbone(cfg) # build_fcos_resnet_fpn_backbone
pdb.set_trace()
self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) # FCOS
pdb.set_trace()
self.blender = build_blender(cfg) # blender
pdb.set_trace()
self.basis_module = build_basis_module(cfg, self.backbone.output_shape()) # basis_module
pdb.set_trace()
# options when combining instance & semantic outputs
self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED # FALSE
if self.combine_on:
self.panoptic_module = build_sem_seg_head(cfg, self.backbone.output_shape())
self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
self.combine_instances_confidence_threshold = (
cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH)
# build top module
in_channels = cfg.MODEL.FPN.OUT_CHANNELS # 256
num_bases = cfg.MODEL.BASIS_MODULE.NUM_BASES # 4
attn_size = cfg.MODEL.BLENDMASK.ATTN_SIZE # 14
attn_len = num_bases * attn_size * attn_size # K*M*M = 784
self.top_layer = nn.Conv2d(
in_channels, attn_len,
kernel_size=3, stride=1, padding=1)
# self.top_layer Conv2d(256, 784, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
torch.nn.init.normal_(self.top_layer.weight, std=0.01)
torch.nn.init.constant_(self.top_layer.bias, 0)
pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
self.normalizer = lambda x: (x - pixel_mean) / pixel_std
self.to(self.device)
pdb.set_trace()
def forward(self, batched_inputs):  # during BlendMask training, execution enters BlendMask.forward() first
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
image: Tensor, image in (C, H, W) format.
instances: Instances
sem_seg: semantic segmentation ground truth.
Other information that's included in the original dicts, such as:
"height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]: each dict is the results for one image. The dict
contains the following keys:
"instances": see :meth:`GeneralizedRCNN.forward` for its format.
"sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
"panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
See the return value of
:func:`combine_semantic_and_instance_outputs` for its format.
"""
images = [x["image"].to(self.device) for x in batched_inputs]