COCO dataset construction file
import os
import torch.utils.data
import numpy as np
from PIL import Image
from ssd.structures.container import Container
class COCODataset(torch.utils.data.Dataset):
class_names = ('__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush')
def __init__(self, data_dir, ann_file, transform=None, target_transform=None, remove_empty=False):
from pycocotools.coco import COCO
self.coco = COCO(ann_file)
self.data_dir = data_dir
self.transform = transform
self.target_transform = target_transform
self.remove_empty = remove_empty
if self.remove_empty:
# when training, images without annotations are removed.
self.ids = list(self.coco.imgToAnns.keys())
else:
            # when testing, all images are used.
self.ids = list(self.coco.imgs.keys())
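        # COCO category ids are non-contiguous (1..90 with gaps), so map them
        # to contiguous labels 1..80; label 0 is reserved for __background__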
coco_categories = sorted(self.coco.getCatIds())
self.coco_id_to_contiguous_id = {coco_id: i + 1 for i, coco_id in enumerate(coco_categories)}
self.contiguous_id_to_coco_id = {v: k for k, v in self.coco_id_to_contiguous_id.items()}
def __getitem__(self, index):
image_id = self.ids[index]
boxes, labels = self._get_annotation(image_id)
image = self._read_image(image_id)
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
if self.target_transform:
boxes, labels = self.target_transform(boxes, labels)
targets = Container(
boxes=boxes,
labels=labels,
)
return image, targets, index
def get_annotation(self, index):
image_id = self.ids[index]
return image_id, self._get_annotation(image_id)
def __len__(self):
return len(self.ids)
def _get_annotation(self, image_id):
ann_ids = self.coco.getAnnIds(imgIds=image_id)
ann = self.coco.loadAnns(ann_ids)
# filter crowd annotations
ann = [obj for obj in ann if obj["iscrowd"] == 0]
boxes = np.array([self._xywh2xyxy(obj["bbox"]) for obj in ann], np.float32).reshape((-1, 4))
labels = np.array([self.coco_id_to_contiguous_id[obj["category_id"]] for obj in ann], np.int64).reshape((-1,))
# remove invalid boxes
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
labels = labels[keep]
return boxes, labels
def _xywh2xyxy(self, box):
x1, y1, w, h = box
return [x1, y1, x1 + w, y1 + h]
def get_img_info(self, index):
image_id = self.ids[index]
img_data = self.coco.imgs[image_id]
return img_data
def _read_image(self, image_id):
file_name = self.coco.loadImgs(image_id)[0]['file_name']
image_file = os.path.join(self.data_dir, file_name)
image = Image.open(image_file).convert("RGB")
image = np.array(image)
return image
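A minimal usage sketch; the paths below are placeholders for a local COCO layout, and with no transform the image stays a raw HxWx3 numpy array:
# hypothetical paths -- adjust to the local dataset layout
dataset = COCODataset(data_dir='datasets/coco/val2017',
                      ann_file='datasets/coco/annotations/instances_val2017.json',
                      remove_empty=False)
image, targets, index = dataset[0]
boxes, labels = targets['boxes'], targets['labels']
# boxes: (N, 4) float32 corner-form (x1, y1, x2, y2); labels: (N,) int64 in 1..80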
VOC dataset construction file
import os
import torch.utils.data
import numpy as np
import xml.etree.ElementTree as ET
from PIL import Image
from ssd.structures.container import Container
class VOCDataset(torch.utils.data.Dataset):
class_names = ('__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
def __init__(self, data_dir, split, transform=None, target_transform=None, keep_difficult=False):
"""Dataset for VOC data.
Args:
            data_dir: the root of the VOC2007 or VOC2012 dataset; the directory contains these
                sub-directories: Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject.
"""
self.data_dir = data_dir
self.split = split
self.transform = transform
self.target_transform = target_transform
image_sets_file = os.path.join(self.data_dir, "ImageSets", "Main", "%s.txt" % self.split)
self.ids = VOCDataset._read_image_ids(image_sets_file)
self.keep_difficult = keep_difficult
self.class_dict = {class_name: i for i, class_name in enumerate(self.class_names)}
def __getitem__(self, index):
image_id = self.ids[index]
boxes, labels, is_difficult = self._get_annotation(image_id)
if not self.keep_difficult:
boxes = boxes[is_difficult == 0]
labels = labels[is_difficult == 0]
image = self._read_image(image_id)
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
if self.target_transform:
boxes, labels = self.target_transform(boxes, labels)
targets = Container(
boxes=boxes,
labels=labels,
)
return image, targets, index
def get_annotation(self, index):
image_id = self.ids[index]
return image_id, self._get_annotation(image_id)
def __len__(self):
return len(self.ids)
@staticmethod
def _read_image_ids(image_sets_file):
ids = []
with open(image_sets_file) as f:
for line in f:
ids.append(line.rstrip())
return ids
def _get_annotation(self, image_id):
annotation_file = os.path.join(self.data_dir, "Annotations", "%s.xml" % image_id)
objects = ET.parse(annotation_file).findall("object")
boxes = []
labels = []
is_difficult = []
for obj in objects:
class_name = obj.find('name').text.lower().strip()
bbox = obj.find('bndbox')
            # VOC annotations follow Matlab, in which indexes start from 1; subtract 1 for 0-based coordinates
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
boxes.append([x1, y1, x2, y2])
labels.append(self.class_dict[class_name])
is_difficult_str = obj.find('difficult').text
is_difficult.append(int(is_difficult_str) if is_difficult_str else 0)
return (np.array(boxes, dtype=np.float32),
np.array(labels, dtype=np.int64),
np.array(is_difficult, dtype=np.uint8))
def get_img_info(self, index):
img_id = self.ids[index]
annotation_file = os.path.join(self.data_dir, "Annotations", "%s.xml" % img_id)
anno = ET.parse(annotation_file).getroot()
size = anno.find("size")
im_info = tuple(map(int, (size.find("height").text, size.find("width").text)))
return {"height": im_info[0], "width": im_info[1]}
def _read_image(self, image_id):
image_file = os.path.join(self.data_dir, "JPEGImages", "%s.jpg" % image_id)
image = Image.open(image_file).convert("RGB")
image = np.array(image)
return image
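A minimal sketch along the same lines; the VOC root and split name are placeholders:
# hypothetical path -- point data_dir at a VOC2007/VOC2012 root
dataset = VOCDataset(data_dir='datasets/VOC2007', split='trainval', keep_difficult=False)
print(len(dataset))              # number of ids in ImageSets/Main/trainval.txt
image_id, (boxes, labels, is_difficult) = dataset.get_annotation(0)
# boxes are 0-based corner-form after the -1 shift applied in _get_annotation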
Backbone module construction
VGG
import torch.nn as nn
import torch.nn.functional as F
from ssd.layers import L2Norm
from ssd.modeling import registry
from ssd.utils.model_zoo import load_state_dict_from_url
model_urls = {
'vgg': 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth',
}
# borrowed from https://github.com/amdegroot/ssd.pytorch/blob/master/ssd.py
def add_vgg(cfg, batch_norm=False):
layers = []
in_channels = 3
for v in cfg:
if v == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
elif v == 'C':
layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
else:
conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
if batch_norm:
layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
else:
layers += [conv2d, nn.ReLU(inplace=True)]
in_channels = v
pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
layers += [pool5, conv6,
nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
return layers
def add_extras(cfg, i, size=300):
# Extra layers added to VGG for feature scaling
layers = []
in_channels = i
flag = False
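    # flag alternates kernel_size between 1 and 3 via (1, 3)[flag];
    # 'S' entries mark the stride-2 convolutions that halve the feature map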
for k, v in enumerate(cfg):
if in_channels != 'S':
if v == 'S':
layers += [nn.Conv2d(in_channels, cfg[k + 1], kernel_size=(1, 3)[flag], stride=2, padding=1)]
else:
layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])]
flag = not flag
in_channels = v
if size == 512:
layers.append(nn.Conv2d(in_channels, 128, kernel_size=1, stride=1))
layers.append(nn.Conv2d(128, 256, kernel_size=4, stride=1, padding=1))
return layers
vgg_base = {
'300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
512, 512, 512],
'512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
512, 512, 512],
}
extras_base = {
'300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256],
'512': [256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256],
}
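# tracing add_extras on the '300' config with i=1024 yields four conv pairs:
# (1024->256 k1, 256->512 k3 s2), (512->128 k1, 128->256 k3 s2),
# (256->128 k1, 128->256 k3), (256->128 k1, 128->256 k3)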
class VGG(nn.Module):
def __init__(self, cfg):
super().__init__()
size = cfg.INPUT.IMAGE_SIZE
vgg_config = vgg_base[str(size)]
extras_config = extras_base[str(size)]
self.vgg = nn.ModuleList(add_vgg(vgg_config))
self.extras = nn.ModuleList(add_extras(extras_config, i=1024, size=size))
self.l2_norm = L2Norm(512, scale=20)
self.reset_parameters()
def reset_parameters(self):
for m in self.extras.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_uniform_(m.weight)
nn.init.zeros_(m.bias)
def init_from_pretrain(self, state_dict):
self.vgg.load_state_dict(state_dict)
def forward(self, x):
features = []
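        # apply vgg up to conv4_3 relu (index 22 in the layer list)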
for i in range(23):
x = self.vgg[i](x)
s = self.l2_norm(x) # Conv4_3 L2 normalization
features.append(s)
# apply vgg up to fc7
for i in range(23, len(self.vgg)):
x = self.vgg[i](x)
features.append(x)
for k, v in enumerate(self.extras):
x = F.relu(v(x), inplace=True)
if k % 2 == 1:
features.append(x)
return tuple(features)
@registry.BACKBONES.register('vgg')
def vgg(cfg, pretrained=True):
model = VGG(cfg)
if pretrained:
model.init_from_pretrain(load_state_dict_from_url(model_urls['vgg']))
return model
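The six returned feature maps drive the SSD detection heads. A quick shape sanity check for the 300 configuration, bypassing the yacs cfg object and mirroring VGG.forward with the same layer indices:
import torch
import torch.nn.functional as F
from torch import nn

vgg = nn.ModuleList(add_vgg(vgg_base['300']))
extras = nn.ModuleList(add_extras(extras_base['300'], i=1024, size=300))
x = torch.randn(1, 3, 300, 300)
feats = []
for i in range(23):
    x = vgg[i](x)
feats.append(x)          # conv4_3: (1, 512, 38, 38) -- L2Norm'd in the real model
for i in range(23, len(vgg)):
    x = vgg[i](x)
feats.append(x)          # fc7: (1, 1024, 19, 19)
for k, v in enumerate(extras):
    x = F.relu(v(x), inplace=True)
    if k % 2 == 1:
        feats.append(x)  # (512, 10, 10), (256, 5, 5), (256, 3, 3), (256, 1, 1)
print([tuple(f.shape) for f in feats])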
MobileNetV2
from torch import nn
from ssd.modeling import registry
from ssd.utils.model_zoo import load_state_dict_from_url
model_urls = {
'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}
class ConvBNReLU(nn.Sequential):
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
padding = (kernel_size - 1) // 2
super(ConvBNReLU, self).__init__(
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
nn.BatchNorm2d(out_planes),
nn.ReLU6(inplace=True)
)
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = int(round(inp * expand_ratio))
self.use_res_connect = self.stride == 1 and inp == oup
layers = []
if expand_ratio != 1:
# pw
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
layers.extend([
# dw
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
])
self.conv = nn.Sequential(*layers)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self, width_mult=1.0, inverted_residual_setting=None):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
last_channel = 1280
if inverted_residual_setting is None:
inverted_residual_setting = [
# t, c, n, s
[1, 16, 1, 1],
[6, 24, 2, 2],
[6, 32, 3, 2],
[6, 64, 4, 2],
[6, 96, 3, 1],
[6, 160, 3, 2],
[6, 320, 1, 1],
]
# only check the first element, assuming user knows t,c,n,s are required
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be a non-empty "
                             "list of 4-element lists, got {}".format(inverted_residual_setting))
# building first layer
input_channel = int(input_channel * width_mult)
self.last_channel = int(last_channel * max(1.0, width_mult))
features = [ConvBNReLU(3, input_channel, stride=2)]
# building inverted residual blocks
for t, c, n, s in inverted_residual_setting:
output_channel = int(c * width_mult)
for i in range(n):
stride = s if i == 0 else 1
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
# building last several layers
features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
# make it nn.Sequential
self.features = nn.Sequential(*features)
self.extras = nn.ModuleList([
InvertedResidual(1280, 512, 2, 0.2),
InvertedResidual(512, 256, 2, 0.25),
InvertedResidual(256, 256, 2, 0.5),
InvertedResidual(256, 64, 2, 0.25)
])
self.reset_parameters()
def reset_parameters(self):
# weight initialization
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, 0, 0.01)
nn.init.zeros_(m.bias)
def forward(self, x):
features = []
for i in range(14):
x = self.features[i](x)
features.append(x)
for i in range(14, len(self.features)):
x = self.features[i](x)
features.append(x)
for i in range(len(self.extras)):
x = self.extras[i](x)
features.append(x)
return tuple(features)
@registry.BACKBONES.register('mobilenet_v2')
def mobilenet_v2(cfg, pretrained=True):
model = MobileNetV2()
if pretrained:
model.load_state_dict(load_state_dict_from_url(model_urls['mobilenet_v2']), strict=False)
return model
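The same kind of shape check for MobileNetV2, assuming the 320x320 input used by the SSDLite configs:
import torch
model = MobileNetV2()
feats = model(torch.randn(1, 3, 320, 320))
print([tuple(f.shape) for f in feats])
# expected:
# (1, 96, 20, 20)    features[0:14], stride 16
# (1, 1280, 10, 10)  features[14:], stride 32
# (1, 512, 5, 5)     extras[0]
# (1, 256, 3, 3)     extras[1]
# (1, 256, 2, 2)     extras[2]
# (1, 64, 1, 1)      extras[3]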
Head module construction
from torch import nn
import torch.nn.functional as F
from ssd.modeling import registry
from ssd.modeling.anchors.prior_box import PriorBox
from ssd.modeling.box_head.box_predictor import make_box_predictor
from ssd.utils import box_utils
from .inference import PostProcessor
from .loss import MultiBoxLoss
@registry.BOX_HEADS.register('SSDBoxHead')
class SSDBoxHead(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.predictor = make_box_predictor(cfg)
self.loss_evaluator = MultiBoxLoss(neg_pos_ratio=cfg.MODEL.NEG_POS_RATIO)
self.post_processor = PostProcessor(cfg)
self.priors = None
def forward(self, features, targets=None):
cls_logits, bbox_pred = self.predictor(features)
if self.training:
return self._forward_train(cls_logits, bbox_pred, targets)
else:
return self._forward_test(cls_logits, bbox_pred)
def _forward_train(self, cls_logits, bbox_pred, targets):
gt_boxes, gt_labels = targets['boxes'], targets['labels']
reg_loss, cls_loss = self.loss_evaluator(cls_logits, bbox_pred, gt_labels, gt_boxes)
loss_dict = dict(
reg_loss=reg_loss,
cls_loss=cls_loss,
)
detections = (cls_logits, bbox_pred)
return detections, loss_dict
def _forward_test(self, cls_logits, bbox_pred):
if self.priors is None:
self.priors = PriorBox(self.cfg)().to(bbox_pred.device)
scores = F.softmax(cls_logits, dim=2)
boxes = box_utils.convert_locations_to_boxes(
bbox_pred, self.priors, self.cfg.MODEL.CENTER_VARIANCE, self.cfg.MODEL.SIZE_VARIANCE
)
boxes = box_utils.center_form_to_corner_form(boxes)
detections = (scores, boxes)
detections = self.post_processor(detections)
return detections, {}
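In training mode the head returns the raw (cls_logits, bbox_pred) plus a loss dict; in eval mode it decodes the regressed offsets against the priors and hands (scores, boxes) to the post-processor for NMS. A sketch of the two call patterns (cfg, features and targets here are placeholders):
# hypothetical call pattern; features comes from a backbone, targets from a dataset batch
head = SSDBoxHead(cfg)
head.train()
detections, loss_dict = head(features, targets=targets)
loss = loss_dict['reg_loss'] + loss_dict['cls_loss']
head.eval()
with torch.no_grad():
    detections, _ = head(features)   # post-processed boxes/labels/scores per image
The predictor called through self.predictor is built by make_box_predictor, defined next.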
import torch
from torch import nn
from ssd.layers import SeparableConv2d
from ssd.modeling import registry
class BoxPredictor(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.cls_headers = nn.ModuleList()
self.reg_headers = nn.ModuleList()
for level, (boxes_per_location, out_channels) in enumerate(zip(cfg.MODEL.PRIORS.BOXES_PER_LOCATION, cfg.MODEL.BACKBONE.OUT_CHANNELS)):
self.cls_headers.append(self.cls_block(level, out_channels, boxes_per_location))
self.reg_headers.append(self.reg_block(level, out_channels, boxes_per_location))
self.reset_parameters()
def cls_block(self, level, out_channels, boxes_per_location):
raise NotImplementedError
def reg_block(self, level, out_channels, boxes_per_location):
raise NotImplementedError
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_uniform_(m.weight)
nn.init.zeros_(m.bias)
def forward(self, features):
cls_logits = []
bbox_pred = []
for feature, cls_header, reg_header in zip(features, self.cls_headers, self.reg_headers):
cls_logits.append(cls_header(feature).permute(0, 2, 3, 1).contiguous())
bbox_pred.append(reg_header(feature).permute(0, 2, 3, 1).contiguous())
batch_size = features[0].shape[0]
cls_logits = torch.cat([c.view(c.shape[0], -1) for c in cls_logits], dim=1).view(batch_size, -1, self.cfg.MODEL.NUM_CLASSES)
bbox_pred = torch.cat([l.view(l.shape[0], -1) for l in bbox_pred], dim=1).view(batch_size, -1, 4)
return cls_logits, bbox_pred
@registry.BOX_PREDICTORS.register('SSDBoxPredictor')
class SSDBoxPredictor(BoxPredictor):
def cls_block(self, level, out_channels, boxes_per_location):
return nn.Conv2d(out_channels, boxes_per_location * self.cfg.MODEL.NUM_CLASSES, kernel_size=3, stride=1, padding=1)
def reg_block(self, level, out_channels, boxes_per_location):
return nn.Conv2d(out_channels, boxes_per_location * 4, kernel_size=3, stride=1, padding=1)
@registry.BOX_PREDICTORS.register('SSDLiteBoxPredictor')
class SSDLiteBoxPredictor(BoxPredictor):
def cls_block(self, level, out_channels, boxes_per_location):
num_levels = len(self.cfg.MODEL.BACKBONE.OUT_CHANNELS)
if level == num_levels - 1:
return nn.Conv2d(out_channels, boxes_per_location * self.cfg.MODEL.NUM_CLASSES, kernel_size=1)
return SeparableConv2d(out_channels, boxes_per_location * self.cfg.MODEL.NUM_CLASSES, kernel_size=3, stride=1, padding=1)
def reg_block(self, level, out_channels, boxes_per_location):
num_levels = len(self.cfg.MODEL.BACKBONE.OUT_CHANNELS)
if level == num_levels - 1:
return nn.Conv2d(out_channels, boxes_per_location * 4, kernel_size=1)
return SeparableConv2d(out_channels, boxes_per_location * 4, kernel_size=3, stride=1, padding=1)
def make_box_predictor(cfg):
return registry.BOX_PREDICTORS[cfg.MODEL.BOX_HEAD.PREDICTOR](cfg)
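The final views work because each head packs its per-location predictions along the channel axis, so after the permute and flatten every consecutive group of NUM_CLASSES (or 4) values belongs to one prior. For the common SSD300 setup the prior count comes out to 8732; a worked check, assuming BOXES_PER_LOCATION = [4, 6, 6, 6, 4, 4] over feature maps [38, 19, 10, 5, 3, 1]:
feature_maps = [38, 19, 10, 5, 3, 1]
boxes_per_location = [4, 6, 6, 6, 4, 4]
num_priors = sum(f * f * b for f, b in zip(feature_maps, boxes_per_location))
print(num_priors)  # 8732 = 5776 + 2166 + 600 + 150 + 36 + 4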