diff --git a/README.md b/README.md
index 3f94c149a..70346f3d3 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ We provide reference implementations for each of the 7 benchmarks in the MLPerf
 * image_classification - Resnet-50 v1 applied to Imagenet.
 * object_detection - Mask R-CNN applied to COCO.
+* single_stage_detector - SSD applied to COCO 2017.
 * speech_recognition - DeepSpeech2 applied to Librispeech.
 * translation - Transformer applied to WMT English-German.
 * recommendation - Neural Collaborative Filtering applied to MovieLens 20 Million (ml-20m).
diff --git a/single_stage_detector/Dockerfile b/single_stage_detector/Dockerfile
new file mode 100755
index 000000000..af2632105
--- /dev/null
+++ b/single_stage_detector/Dockerfile
@@ -0,0 +1,20 @@
+FROM pytorch/pytorch:0.4_cuda9_cudnn7
+
+# Set working directory
+WORKDIR /mlperf
+
+RUN apt-get update && \
+    apt-get install -y python3-tk python-pip
+
+# Necessary pip packages
+RUN pip install --upgrade pip
+RUN pip install Cython==0.28.4 \
+    matplotlib==2.2.2
+RUN python3 -m pip install pycocotools==2.0.0
+
+# Copy SSD code
+WORKDIR /mlperf
+COPY . .
+RUN pip install -r requirements.txt
+
+WORKDIR /mlperf/ssd
diff --git a/single_stage_detector/download_dataset.sh b/single_stage_detector/download_dataset.sh
new file mode 100755
index 000000000..3d0712519
--- /dev/null
+++ b/single_stage_detector/download_dataset.sh
@@ -0,0 +1,7 @@
+# Get COCO 2017 data sets
+dir=$(pwd)
+mkdir /coco; cd /coco
+curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip
+curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip
+curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip
+cd $dir
diff --git a/single_stage_detector/requirements.txt b/single_stage_detector/requirements.txt
new file mode 100755
index 000000000..131c6adea
--- /dev/null
+++ b/single_stage_detector/requirements.txt
@@ -0,0 +1,12 @@
+cycler==0.10.0
+kiwisolver==1.0.1
+matplotlib==2.2.2
+numpy==1.14.5
+Pillow==5.2.0
+pycocotools==2.0.0
+pyparsing==2.2.0
+python-dateutil==2.7.3
+pytz==2018.5
+six==1.11.0
+torch==0.4.0
+torchvision==0.2.1
diff --git a/single_stage_detector/ssd/README.md b/single_stage_detector/ssd/README.md
new file mode 100644
index 000000000..4066f45f4
--- /dev/null
+++ b/single_stage_detector/ssd/README.md
@@ -0,0 +1,71 @@
+# 1. Problem
+Object detection.
+
+# 2. Directions
+
+### Steps to configure machine
+From Source
+
+Standard script.
+
+From Docker
+1. Check out the MLPerf repository
+```
+git clone https://github.com/mlperf/reference.git
+```
+2. Install CUDA and Docker
+```
+source reference/install_cuda_docker.sh
+```
+3. Build the Docker image for the single-stage detection task
+```
+# Build from Dockerfile
+cd reference/single_stage_detector/
+sudo docker build -t mlperf/single_stage_detector .
+```
+
+### Steps to download data
+```
+cd reference/single_stage_detector/
+source download_dataset.sh
+```
+
+### Steps to run benchmark
+From Source
+
+Run the run_and_time.sh script
+```
+cd reference/single_stage_detector/ssd
+source run_and_time.sh SEED TARGET
+```
+where SEED is the random seed for a run and TARGET is the quality target from Section 5 below.
+
+Docker Image
+```
+sudo nvidia-docker run -v /coco:/coco -t -i --rm --ipc=host mlperf/single_stage_detector ./run_and_time.sh SEED TARGET
+```
+
+# 3. Dataset/Environment
+### Publication/Attribution
+Microsoft COCO: Common Objects in Context. 2017.
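+
+As an optional sanity check (not part of the benchmark timing), the annotation files can be loaded with `pycocotools` to confirm the download. This is a minimal sketch assuming the `/coco` layout created by `download_dataset.sh`:
+```
+from pycocotools.coco import COCO
+
+train = COCO("/coco/annotations/instances_train2017.json")
+val = COCO("/coco/annotations/instances_val2017.json")
+print(len(train.getImgIds()), "training images")    # expect 118287
+print(len(val.getImgIds()), "validation images")    # expect 5000
+```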
+ +### Training and test data separation +Train on 2017 COCO train data set, compute mAP on 2017 COCO val data set. + +# 4. Model. +### Publication/Attribution +Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. In the Proceedings of the European Conference on Computer Vision (ECCV), 2016. + +Backbone is ResNet34 pretrained on ILSVRC 2012 (from torchvision). Modifications to the backbone networks: remove conv_5x residual blocks, change the first 3x3 convolution of the conv_4x block from stride 2 to stride1 (this increases the resolution of the feature map to which detector heads are attached), attach all 6 detector heads to the output of the last conv_4x residual block. Thus detections are attached to 38x38, 19x19, 10x10, 5x5, 3x3, and 1x1 feature maps. Convolutions in the detector layers are followed by batch normalization layers. + +# 5. Quality. +### Quality metric +Metric is COCO box mAP (averaged over IoU of 0.5:0.95), computed over 2017 COCO val data. + +### Quality target +mAP of 0.212 + +### Evaluation frequency + +### Evaluation thoroughness +All the images in COCO 2017 val data set. diff --git a/single_stage_detector/ssd/base_model.py b/single_stage_detector/ssd/base_model.py new file mode 100644 index 000000000..04f39ff33 --- /dev/null +++ b/single_stage_detector/ssd/base_model.py @@ -0,0 +1,206 @@ +""" + Load the vgg16 weight and save it to special file +""" + +#from torchvision.models.vgg import vgg16 +import torch.nn as nn +import torch.nn.functional as F +import torch +from torch.autograd import Variable +from collections import OrderedDict + +from torchvision.models.resnet import resnet18, resnet34, resnet50 + +def _ModifyConvStrideDilation(conv, stride=(1, 1), padding=None): + conv.stride = stride + + if padding is not None: + conv.padding = padding + +def _ModifyBlock(block, bottleneck=False, **kwargs): + for m in list(block.children()): + if bottleneck: + _ModifyConvStrideDilation(m.conv2, **kwargs) + else: + _ModifyConvStrideDilation(m.conv1, **kwargs) + + if m.downsample is not None: + # need to make sure no padding for the 1x1 residual connection + _ModifyConvStrideDilation(list(m.downsample.children())[0], **kwargs) + +class ResNet18(nn.Module): + def __init__(self): + super().__init__() + rn18 = resnet18(pretrained=True) + + + # discard last Resnet block, avrpooling and classification FC + # layer1 = up to and including conv3 block + self.layer1 = nn.Sequential(*list(rn18.children())[:6]) + # layer2 = conv4 block only + self.layer2 = nn.Sequential(*list(rn18.children())[6:7]) + + # modify conv4 if necessary + # Always deal with stride in first block + modulelist = list(self.layer2.children()) + _ModifyBlock(modulelist[0], stride=(1,1)) + + def forward(self, data): + layer1_activation = self.layer1(data) + x = layer1_activation + layer2_activation = self.layer2(x) + + # Only need the output of conv4 + return [layer2_activation] + +class ResNet34(nn.Module): + def __init__(self): + super().__init__() + rn34 = resnet34(pretrained=True) + + # discard last Resnet block, avrpooling and classification FC + self.layer1 = nn.Sequential(*list(rn34.children())[:6]) + self.layer2 = nn.Sequential(*list(rn34.children())[6:7]) + # modify conv4 if necessary + # Always deal with stride in first block + modulelist = list(self.layer2.children()) + _ModifyBlock(modulelist[0], stride=(1,1)) + + + def forward(self, data): + layer1_activation = self.layer1(data) + x = layer1_activation + 
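+        # layer2 is the conv_4x block with its first conv changed to stride 1, so for a
+        # 300x300 input it yields the 38x38 feature map that the SSD heads build on (see README)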
layer2_activation = self.layer2(x) + + return [layer2_activation] + +class L2Norm(nn.Module): + """ + Scale shall be learnable according to original paper + scale: initial scale number + chan_num: L2Norm channel number (norm over all channels) + """ + def __init__(self, scale=20, chan_num=512): + super(L2Norm, self).__init__() + # Scale across channels + self.scale = \ + nn.Parameter(torch.Tensor([scale]*chan_num).view(1, chan_num, 1, 1)) + + def forward(self, data): + # normalize accross channel + return self.scale*data*data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt() + + + +def tailor_module(src_model, src_dir, tgt_model, tgt_dir): + state = torch.load(src_dir) + src_model.load_state_dict(state) + src_state = src_model.state_dict() + # only need features + keys1 = src_state.keys() + keys1 = [k for k in src_state.keys() if k.startswith("features")] + keys2 = tgt_model.state_dict().keys() + + assert len(keys1) == len(keys2) + state = OrderedDict() + + for k1, k2 in zip(keys1, keys2): + # print(k1, k2) + state[k2] = src_state[k1] + #diff_keys = state.keys() - target_model.state_dict().keys() + #print("Different Keys:", diff_keys) + # Remove unecessary keys + #for k in diff_keys: + # state.pop(k) + tgt_model.load_state_dict(state) + torch.save(tgt_model.state_dict(), tgt_dir) + +# Default vgg16 in pytorch seems different from ssd +def make_layers(cfg, batch_norm=False): + layers = [] + in_channels = 3 + for v in cfg: + if v == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + elif v == 'C': + # Notice ceil_mode is true + layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] + else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + if batch_norm: + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] + else: + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + return layers + +class Loss(nn.Module): + """ + Implements the loss as the sum of the followings: + 1. Confidence Loss: All labels, with hard negative mining + 2. 
Localization Loss: Only on positive labels + Suppose input dboxes has the shape 8732x4 + """ + + def __init__(self, dboxes): + super(Loss, self).__init__() + self.scale_xy = 1.0/dboxes.scale_xy + self.scale_wh = 1.0/dboxes.scale_wh + + self.sl1_loss = nn.SmoothL1Loss(reduce=False) + self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0), + requires_grad=False) + # Two factor are from following links + # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html + self.con_loss = nn.CrossEntropyLoss(reduce=False) + + def _loc_vec(self, loc): + """ + Generate Location Vectors + """ + gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ] + gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log() + + return torch.cat((gxy, gwh), dim=1).contiguous() + + def forward(self, ploc, plabel, gloc, glabel): + """ + ploc, plabel: Nx4x8732, Nxlabel_numx8732 + predicted location and labels + + gloc, glabel: Nx4x8732, Nx8732 + ground truth location and labels + """ + + mask = glabel > 0 + pos_num = mask.sum(dim=1) + + vec_gd = self._loc_vec(gloc) + + # sum on four coordinates, and mask + sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1) + sl1 = (mask.float()*sl1).sum(dim=1) + + # hard negative mining + con = self.con_loss(plabel, glabel) + + # postive mask will never selected + con_neg = con.clone() + con_neg[mask] = 0 + _, con_idx = con_neg.sort(dim=1, descending=True) + _, con_rank = con_idx.sort(dim=1) + + # number of negative three times positive + neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1) + neg_mask = con_rank < neg_num + + closs = (con*(mask.float() + neg_mask.float())).sum(dim=1) + + # avoid no object detected + total_loss = sl1 + closs + num_mask = (pos_num > 0).float() + pos_num = pos_num.float().clamp(min=1e-6) + + ret = (total_loss*num_mask/pos_num).mean(dim=0) + return ret + diff --git a/single_stage_detector/ssd/coco.py b/single_stage_detector/ssd/coco.py new file mode 100755 index 000000000..dd0e880be --- /dev/null +++ b/single_stage_detector/ssd/coco.py @@ -0,0 +1,433 @@ +__author__ = 'tylin' +__version__ = '2.0' +# Interface for accessing the Microsoft COCO dataset. + +# Microsoft COCO is a large image dataset designed for object detection, +# segmentation, and caption generation. pycocotools is a Python API that +# assists in loading, parsing and visualizing the annotations in COCO. +# Please visit http://mscoco.org/ for more information on COCO, including +# for the data, paper, and tutorials. The exact format of the annotations +# is also described on the COCO website. For example usage of the pycocotools +# please see pycocotools_demo.ipynb. In addition to this API, please download both +# the COCO images and annotations in order to run the demo. + +# An alternative to using the API is to load the annotations directly +# into Python dictionary +# Using the API provides additional utility functions. Note that this API +# supports both *instance* and *caption* annotations. In the case of +# captions not all functions are defined (e.g. categories are undefined). + +# The following API functions are defined: +# COCO - COCO api class that loads COCO annotation file and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. 
+# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# annToMask - Convert segmentation in an annotation to binary mask. +# showAnns - Display the specified annotations. +# loadRes - Load algorithm results and create API for accessing them. +# download - Download COCO images from mscoco.org server. +# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. +# Help on each functions can be accessed by: "help COCO>function". + +# See also COCO>decodeMask, +# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, +# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, +# COCO>loadImgs, COCO>annToMask, COCO>showAnns + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import json +import time +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +import numpy as np +import copy +import itertools +from pycocotools import mask as maskUtils +import os +from collections import defaultdict +import sys +PYTHON_VERSION = sys.version_info[0] +if PYTHON_VERSION == 2: + from urllib import urlretrieve +elif PYTHON_VERSION == 3: + from urllib.request import urlretrieve + + +def _isArrayLike(obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + +class COCO: + def __init__(self, annotation_file=None): + """ + Constructor of Microsoft COCO helper class for reading and visualizing annotations. + :param annotation_file (str): location of annotation file + :param image_folder (str): location to the folder that hosts images. + :return: + """ + # load dataset + self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) + if not annotation_file == None: + print('loading annotations into memory...') + tic = time.time() + dataset = json.load(open(annotation_file, 'r')) + assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time()- tic)) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print('creating index...') + anns, cats, imgs = {}, {}, {} + imgToAnns,catToImgs = defaultdict(list),defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']].append(ann) + anns[ann['id']] = ann + + if 'images' in self.dataset: + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']].append(ann['image_id']) + + print('index created!') + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + def info(self): + """ + Print information about the annotation file. + :return: + """ + for key, value in self.dataset['info'].items(): + print('{}: {}'.format(key, value)) + + def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): + """ + Get ann ids that satisfy given filter conditions. 
default skips that filter + :param imgIds (int array) : get anns for given imgs + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range (e.g. [0 inf]) + iscrowd (boolean) : get anns for given crowd label (False or True) + :return: ids (int array) : integer array of ann ids + """ + imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(imgIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(imgIds) == 0: + lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] + anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] + if not iscrowd == None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """ + filtering parameters. default skips that filter. + :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if _isArrayLike(catNms) else [catNms] + supNms = supNms if _isArrayLike(supNms) else [supNms] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] + cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] + cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] + ids = [cat['id'] for cat in cats] + return ids + + def getImgIds(self, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(imgIds) == len(catIds) == 0: + ids = self.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToImgs[catId]) + else: + ids &= set(self.catToImgs[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if _isArrayLike(ids): + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """ + Load cats with the specified ids. + :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if _isArrayLike(ids): + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadImgs(self, ids=[]): + """ + Load anns with the specified ids. 
+ :param ids (int array) : integer ids specifying img + :return: imgs (object array) : loaded img objects + """ + if _isArrayLike(ids): + return [self.imgs[id] for id in ids] + elif type(ids) == int: + return [self.imgs[ids]] + + def showAnns(self, anns): + """ + Display the specified annotations. + :param anns (array of object): annotations to display + :return: None + """ + if len(anns) == 0: + return 0 + if 'segmentation' in anns[0] or 'keypoints' in anns[0]: + datasetType = 'instances' + elif 'caption' in anns[0]: + datasetType = 'captions' + else: + raise Exception('datasetType not supported') + if datasetType == 'instances': + ax = plt.gca() + ax.set_autoscale_on(False) + polygons = [] + color = [] + for ann in anns: + c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] + if 'segmentation' in ann: + if type(ann['segmentation']) == list: + # polygon + for seg in ann['segmentation']: + poly = np.array(seg).reshape((int(len(seg)/2), 2)) + polygons.append(Polygon(poly)) + color.append(c) + else: + # mask + t = self.imgs[ann['image_id']] + if type(ann['segmentation']['counts']) == list: + rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + else: + rle = [ann['segmentation']] + m = maskUtils.decode(rle) + img = np.ones( (m.shape[0], m.shape[1], 3) ) + if ann['iscrowd'] == 1: + color_mask = np.array([2.0,166.0,101.0])/255 + if ann['iscrowd'] == 0: + color_mask = np.random.random((1, 3)).tolist()[0] + for i in range(3): + img[:,:,i] = color_mask[i] + ax.imshow(np.dstack( (img, m*0.5) )) + if 'keypoints' in ann and type(ann['keypoints']) == list: + # turn skeleton into zero-based index + sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 + kp = np.array(ann['keypoints']) + x = kp[0::3] + y = kp[1::3] + v = kp[2::3] + for sk in sks: + if np.all(v[sk]>0): + plt.plot(x[sk],y[sk], linewidth=3, color=c) + plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) + plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) + p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + ax.add_collection(p) + p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) + ax.add_collection(p) + elif datasetType == 'captions': + for ann in anns: + print(ann['caption']) + + def loadRes(self, resFile): + """ + Load result file and return a result api object. 
+ :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + + print('Loading and preparing results...') + tic = time.time() + if type(resFile) == str: #or type(resFile) == unicode: + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) + res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + for id, ann in enumerate(anns): + ann['id'] = id+1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + bb = ann['bbox'] + x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] + if not 'segmentation' in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2]*bb[3] + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = maskUtils.area(ann['segmentation']) + if not 'bbox' in ann: + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x1-x0)*(y1-y0) + ann['id'] = id + 1 + ann['bbox'] = [x0,y0,x1-x0,y1-y0] + print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + + res.dataset['annotations'] = anns + res.createIndex() + return res + + def download(self, tarDir = None, imgIds = [] ): + ''' + Download COCO images from mscoco.org server. 
+ :param tarDir (str): COCO results directory name + imgIds (list): images to be downloaded + :return: + ''' + if tarDir is None: + print('Please specify target directory') + return -1 + if len(imgIds) == 0: + imgs = self.imgs.values() + else: + imgs = self.loadImgs(imgIds) + N = len(imgs) + if not os.path.exists(tarDir): + os.makedirs(tarDir) + for i, img in enumerate(imgs): + tic = time.time() + fname = os.path.join(tarDir, img['file_name']) + if not os.path.exists(fname): + urlretrieve(img['coco_url'], fname) + print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + + def loadNumpyAnnotations(self, data): + """ + Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} + :param data (numpy.ndarray) + :return: annotations (python nested list) + """ + print('Converting ndarray to lists...') + assert(type(data) == np.ndarray) + print(data.shape) + assert(data.shape[1] == 7) + N = data.shape[0] + ann = [] + for i in range(N): + if i % 1000000 == 0: + print('{}/{}'.format(i,N)) + ann += [{ + 'image_id' : int(data[i, 0]), + 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], + 'score' : data[i, 5], + 'category_id': int(data[i, 6]), + }] + return ann + + def annToRLE(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE to RLE. + :return: binary mask (numpy 2D array) + """ + t = self.imgs[ann['image_id']] + h, w = t['height'], t['width'] + segm = ann['segmentation'] + if type(segm) == list: + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(segm, h, w) + rle = maskUtils.merge(rles) + elif type(segm['counts']) == list: + # uncompressed RLE + rle = maskUtils.frPyObjects(segm, h, w) + else: + # rle + rle = ann['segmentation'] + return rle + + def annToMask(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. + :return: binary mask (numpy 2D array) + """ + rle = self.annToRLE(ann) + m = maskUtils.decode(rle) + return m diff --git a/single_stage_detector/ssd/distributed.py b/single_stage_detector/ssd/distributed.py new file mode 100644 index 000000000..7776997fc --- /dev/null +++ b/single_stage_detector/ssd/distributed.py @@ -0,0 +1,82 @@ +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +import torch.distributed as dist +from torch.nn.modules import Module + +''' +This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py +launcher included with this example. It assumes that your run is using multiprocess with 1 +GPU/process, that the model is on the correct device, and that torch.set_device has been +used to set the device. + +Parameters are broadcasted to the other processes on initialization of DistributedDataParallel, +and will be allreduced at the finish of the backward pass. +''' +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + + for p in self.module.state_dict().values(): + if not torch.is_tensor(p): + continue + if dist._backend == dist.dist_backend.NCCL: + assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU." 
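+            # broadcast every parameter/buffer from rank 0 so all workers start from identical weights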
+ dist.broadcast(p, 0) + + def allreduce_params(): + if(self.needs_reduction): + self.needs_reduction = False + buckets = {} + for param in self.module.parameters(): + if param.requires_grad and param.grad is not None: + tp = param.data.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print("WARNING: gloo dist backend for half parameters may be extremely slow." + + " It is recommended to use the NCCL backend in this case.") + self.warn_on_half = False + + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + dist.all_reduce(coalesced) + coalesced /= dist.get_world_size() + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + for param in list(self.module.parameters()): + def allreduce_hook(*unused): + param._execution_engine.queue_callback(allreduce_params) + if param.requires_grad: + param.register_hook(allreduce_hook) + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. + if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' diff --git a/single_stage_detector/ssd/eval.py b/single_stage_detector/ssd/eval.py new file mode 100644 index 000000000..82075a55d --- /dev/null +++ b/single_stage_detector/ssd/eval.py @@ -0,0 +1,193 @@ +import numpy as np +import xml.etree.ElementTree as ET +import pickle +import os + +def voc_ap(rec, prec, use_07_metric=True): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:True). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
+ else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def parse_rec(filename): + """ Parse a PASCAL VOC xml file """ + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, + int(bbox.find('ymin').text) - 1, + int(bbox.find('xmax').text) - 1, + int(bbox.find('ymax').text) - 1] + objects.append(obj_struct) + + return objects + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=True): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) +Top level function that does the PASCAL VOC evaluation. +detpath: Path to detections + detpath.format(classname) should produce the detection results file. +annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. +imagesetfile: Text file containing the list of images, one image per line. +classname: Category name (duh) +cachedir: Directory for caching the annotations +[ovthresh]: Overlap threshold (default = 0.5) +[use_07_metric]: Whether to use VOC07's 11 point AP computation + (default True) +""" +# assumes detections are in detpath.format(classname) +# assumes annotations are in annopath.format(imagename) +# assumes imagesetfile is a text file with each line an image name +# cachedir caches the annotations in a pickle file +# first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, 'annots.pkl') + # read list of images + with open(imagesetfile, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath % (imagename)) + if i % 100 == 0: + print('Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames))) + # save + print('Saving cached annotations to {:s}'.format(cachefile)) + with open(cachefile, 'wb') as f: + pickle.dump(recs, f) + else: + # load + with open(cachefile, 'rb') as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, 'r') as f: + lines = f.readlines() + if any(lines) == 1: + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x 
in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin, 0.) + ih = np.maximum(iymax - iymin, 0.) + inters = iw * ih + uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + + (BBGT[:, 2] - BBGT[:, 0]) * + (BBGT[:, 3] - BBGT[:, 1]) - inters) + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + else: + rec = -1. + prec = -1. + ap = -1. + + return rec, prec, ap + diff --git a/single_stage_detector/ssd/run_and_time.sh b/single_stage_detector/ssd/run_and_time.sh new file mode 100755 index 000000000..01b9ef983 --- /dev/null +++ b/single_stage_detector/ssd/run_and_time.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# runs benchmark and reports time to convergence +# to use the script: +# run_and_time.sh + +SEED=${1:-1} +TARGET=${2:-0.212} + +time stdbuf -o 0 \ + python3 train.py --seed $SEED --threshold $TARGET | tee run.log.$SEED diff --git a/single_stage_detector/ssd/ssd300.py b/single_stage_detector/ssd/ssd300.py new file mode 100644 index 000000000..177b42fae --- /dev/null +++ b/single_stage_detector/ssd/ssd300.py @@ -0,0 +1,141 @@ +import torch +import torch.nn as nn +from base_model import ResNet34 + +class SSD300(nn.Module): + """ + Build a SSD module to take 300x300 image input, + and output 8732 per class bounding boxes + + vggt: pretrained vgg16 (partial) model + label_num: number of classes (including background 0) + """ + def __init__(self, label_num, backbone='resnet34', model_path="./resnet34-333f7ec4.pth"): + + super(SSD300, self).__init__() + + self.label_num = label_num + + if backbone == 'resnet34': + self.model = ResNet34() + out_channels = 256 + out_size = 38 + self.out_chan = [out_channels, 512, 512, 256, 256, 256] + else: + raise ValueError('Invalid backbone chosen') + + self._build_additional_features(out_size, self.out_chan) + + # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2 + # classifer 1, 2, 3, 4, 5 ,6 + + self.num_defaults = [4, 6, 6, 6, 4, 4] + self.loc = [] + self.conf = [] + + for nd, oc in zip(self.num_defaults, self.out_chan): + self.loc.append(nn.Conv2d(oc, nd*4, kernel_size=3, padding=1)) + self.conf.append(nn.Conv2d(oc, nd*label_num, kernel_size=3, padding=1)) + + + self.loc = nn.ModuleList(self.loc) + self.conf = nn.ModuleList(self.conf) + # intitalize all weights + self._init_weights() + + def _build_additional_features(self, input_size, input_channels): + idx = 0 + if input_size == 38: + idx = 0 + elif input_size == 19: + idx = 
1 + elif input_size == 10: + idx = 2 + + self.additional_blocks = [] + + if input_size == 38: + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + )) + idx += 1 + + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + )) + idx += 1 + + # conv9_1, conv9_2 + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx+1], kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + )) + idx += 1 + + # conv10_1, conv10_2 + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx+1], kernel_size=3), + nn.ReLU(inplace=True), + )) + idx += 1 + + # Only necessary in VGG for now + if input_size >= 19: + # conv11_1, conv11_2 + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx+1], kernel_size=3), + nn.ReLU(inplace=True), + )) + + self.additional_blocks = nn.ModuleList(self.additional_blocks) + + def _init_weights(self): + + layers = [ + *self.additional_blocks, + *self.loc, *self.conf] + + for layer in layers: + for param in layer.parameters(): + if param.dim() > 1: nn.init.xavier_uniform_(param) + + # Shape the classifier to the view of bboxes + def bbox_view(self, src, loc, conf): + ret = [] + for s, l, c in zip(src, loc, conf): + ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1))) + + locs, confs = list(zip(*ret)) + locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() + return locs, confs + + def forward(self, data): + + layers = self.model(data) + + # last result from network goes into additional blocks + x = layers[-1] + additional_results = [] + for i, l in enumerate(self.additional_blocks): + x = l(x) + additional_results.append(x) + + src = [*layers, *additional_results] + # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 + + locs, confs = self.bbox_view(src, self.loc, self.conf) + + # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results + return locs, confs diff --git a/single_stage_detector/ssd/train.py b/single_stage_detector/ssd/train.py new file mode 100644 index 000000000..37a447435 --- /dev/null +++ b/single_stage_detector/ssd/train.py @@ -0,0 +1,211 @@ +import os +from argparse import ArgumentParser +from utils import DefaultBoxes, Encoder, COCODetection +from base_model import Loss +from utils import SSDTransformer +from ssd300 import SSD300 +import torch +from torch.autograd import Variable +from torch.utils.data import DataLoader +import time +import numpy as np + + +def parse_args(): + parser = ArgumentParser(description="Train Single Shot MultiBox Detector" + " on COCO") + parser.add_argument('--data', '-d', type=str, default='/coco', + help='path to test and training data files') + parser.add_argument('--epochs', '-e', type=int, default=800, + help='number of epochs for training') + parser.add_argument('--batch-size', '-b', type=int, default=32, + help='number of examples for each iteration') + parser.add_argument('--no-cuda', 
action='store_true', + help='use available GPUs') + parser.add_argument('--seed', '-s', type=int, + help='manually set random seed for torch') + parser.add_argument('--threshold', '-t', type=float, default=0.212, + help='stop training early at threshold') + parser.add_argument('--iteration', type=int, default=0, + help='iteration to start from') + parser.add_argument('--checkpoint', type=str, default=None, + help='path to model checkpoint file') + parser.add_argument('--no-save', action='store_true', + help='save model checkpoints') + parser.add_argument('--evaluation', nargs='*', type=int, + default=[120000, 160000, 180000, 200000, 220000, 240000], + help='iterations at which to evaluate') + return parser.parse_args() + + +def show_memusage(device=0): + import gpustat + gpu_stats = gpustat.GPUStatCollection.new_query() + item = gpu_stats.jsonify()["gpus"][device] + print("{}/{}".format(item["memory.used"], item["memory.total"])) + + +def dboxes300_coco(): + figsize = 300 + feat_size = [38, 19, 10, 5, 3, 1] + steps = [8, 16, 32, 64, 100, 300] + # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py + scales = [21, 45, 99, 153, 207, 261, 315] + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) + return dboxes + + +def coco_eval(model, coco, cocoGt, encoder, inv_map, threshold, use_cuda=True): + from pycocotools.cocoeval import COCOeval + print("") + model.eval() + if use_cuda: + model.cuda() + ret = [] + start = time.time() + for idx, image_id in enumerate(coco.img_keys): + img, (htot, wtot), _, _ = coco[idx] + + with torch.no_grad(): + print("Parsing image: {}/{}".format(idx+1, len(coco)), end="\r") + inp = img.unsqueeze(0) + if use_cuda: + inp = inp.cuda() + ploc, plabel = model(inp) + + try: + result = encoder.decode_batch(ploc, plabel, 0.50, 200)[0] + except: + #raise + print("") + print("No object detected in idx: {}".format(idx)) + continue + + loc, label, prob = [r.cpu().numpy() for r in result] + for loc_, label_, prob_ in zip(loc, label, prob): + ret.append([image_id, loc_[0]*wtot, \ + loc_[1]*htot, + (loc_[2] - loc_[0])*wtot, + (loc_[3] - loc_[1])*htot, + prob_, + inv_map[label_]]) + print("") + print("Predicting Ended, total time: {:.2f} s".format(time.time()-start)) + + cocoDt = cocoGt.loadRes(np.array(ret)) + + E = COCOeval(cocoGt, cocoDt, iouType='bbox') + E.evaluate() + E.accumulate() + E.summarize() + print("Current AP: {:.5f} AP goal: {:.5f}".format(E.stats[0], threshold)) + return (E.stats[0] >= threshold) #Average Precision (AP) @[ IoU=050:0.95 | area= all | maxDets=100 ] + + + +def train300_mlperf_coco(args): + from coco import COCO + # Check that GPUs are actually available + use_cuda = not args.no_cuda and torch.cuda.is_available() + dboxes = dboxes300_coco() + encoder = Encoder(dboxes) + train_trans = SSDTransformer(dboxes, (300, 300), val=False) + val_trans = SSDTransformer(dboxes, (300, 300), val=True) + + val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") + val_coco_root = os.path.join(args.data, "val2017") + train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") + train_coco_root = os.path.join(args.data, "train2017") + + cocoGt = COCO(annotation_file=val_annotate) + val_coco = COCODetection(val_coco_root, val_annotate, val_trans) + train_coco = COCODetection(train_coco_root, train_annotate, train_trans) + + #print("Number of labels: {}".format(train_coco.labelnum)) + train_dataloader = 
DataLoader(train_coco, batch_size=args.batch_size, shuffle=True, num_workers=4) + + ssd300 = SSD300(train_coco.labelnum) + if args.checkpoint is not None: + print("loading model checkpoint", args.checkpoint) + od = torch.load(args.checkpoint) + ssd300.load_state_dict(od["model"]) + ssd300.train() + if use_cuda: + ssd300.cuda() + loss_func = Loss(dboxes) + if use_cuda: + loss_func.cuda() + + optim = torch.optim.SGD(ssd300.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4) + print("epoch", "nbatch", "loss") + + iter_num = args.iteration + avg_loss = 0.0 + inv_map = {v:k for k,v in val_coco.label_map.items()} + + for epoch in range(args.epochs): + + for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader): + + if iter_num == 160000: + print("") + print("lr decay step #1") + for param_group in optim.param_groups: + param_group['lr'] = 1e-4 + + if iter_num == 200000: + print("") + print("lr decay step #2") + for param_group in optim.param_groups: + param_group['lr'] = 1e-5 + + if use_cuda: + img = img.cuda() + img = Variable(img, requires_grad=True) + ploc, plabel = ssd300(img) + trans_bbox = bbox.transpose(1,2).contiguous() + if use_cuda: + trans_bbox = trans_bbox.cuda() + label = label.cuda() + gloc, glabel = Variable(trans_bbox, requires_grad=False), \ + Variable(label, requires_grad=False) + loss = loss_func(ploc, plabel, gloc, glabel) + + if not np.isinf(loss.item()): avg_loss = 0.999*avg_loss + 0.001*loss.item() + + print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ + .format(iter_num, loss.item(), avg_loss), end="\r") + optim.zero_grad() + loss.backward() + optim.step() + + if iter_num in args.evaluation: + if not args.no_save: + print("") + print("saving model...") + torch.save({"model" : ssd300.state_dict(), "label_map": train_coco.label_info}, + "./models/iter_{}.pt".format(iter_num)) + + if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map, args.threshold): + return + + iter_num += 1 + +def main(): + args = parse_args() + + if not os.path.isdir('./models'): + os.mkdir('./models') + + if args.seed is not None: + print("Using seed = {}".format(args.seed)) + torch.manual_seed(args.seed) + np.random.seed(seed=args.seed) + + + torch.backends.cudnn.benchmark = True + train300_mlperf_coco(args) + +if __name__ == "__main__": + main() diff --git a/single_stage_detector/ssd/utils.py b/single_stage_detector/ssd/utils.py new file mode 100644 index 000000000..e1a0901ab --- /dev/null +++ b/single_stage_detector/ssd/utils.py @@ -0,0 +1,772 @@ +import torch +import torchvision +import torchvision.transforms as transforms +import torch.utils.data as data +from PIL import Image +from xml.etree import ElementTree +import os +import glob +from pathlib import Path +import numpy as np +import random +import itertools +import torch.nn.functional as F +import json +import time +import bz2 +import pickle +from math import sqrt, ceil + +# This function is from https://github.com/kuangliu/pytorch-ssd. 
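+# Used by Encoder.encode/decode_single and SSDCropping below,
+# e.g. calc_iou_tensor(gt_boxes, dboxes) -> (num_gt, 8732) IoU matrix.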
+def calc_iou_tensor(box1, box2): + """ Calculation of IoU based on two boxes tensor, + Reference to https://github.com/kuangliu/pytorch-ssd + input: + box1 (N, 4) + box2 (M, 4) + output: + IoU (N, M) + """ + N = box1.size(0) + M = box2.size(0) + + be1 = box1.unsqueeze(1).expand(-1, M, -1) + be2 = box2.unsqueeze(0).expand(N, -1, -1) + + # Left Top & Right Bottom + lt = torch.max(be1[:,:,:2], be2[:,:,:2]) + #mask1 = (be1[:,:, 0] < be2[:,:, 0]) ^ (be1[:,:, 1] < be2[:,:, 1]) + #mask1 = ~mask1 + rb = torch.min(be1[:,:,2:], be2[:,:,2:]) + #mask2 = (be1[:,:, 2] < be2[:,:, 2]) ^ (be1[:,:, 3] < be2[:,:, 3]) + #mask2 = ~mask2 + + delta = rb - lt + delta[delta < 0] = 0 + intersect = delta[:,:,0]*delta[:,:,1] + #*mask1.float()*mask2.float() + + delta1 = be1[:,:,2:] - be1[:,:,:2] + area1 = delta1[:,:,0]*delta1[:,:,1] + delta2 = be2[:,:,2:] - be2[:,:,:2] + area2 = delta2[:,:,0]*delta2[:,:,1] + + iou = intersect/(area1 + area2 - intersect) + return iou + +# This function is from https://github.com/kuangliu/pytorch-ssd. +class Encoder(object): + """ + Inspired by https://github.com/kuangliu/pytorch-ssd + Transform between (bboxes, lables) <-> SSD output + + dboxes: default boxes in size 8732 x 4, + encoder: input ltrb format, output xywh format + decoder: input xywh format, output ltrb format + + encode: + input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes) + output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732) + criteria : IoU threshold of bboexes + + decode: + input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) + output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) + criteria : IoU threshold of bboexes + max_output : maximum number of output bboxes + """ + + def __init__(self, dboxes): + self.dboxes = dboxes(order="ltrb") + self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0) + self.nboxes = self.dboxes.size(0) + #print("# Bounding boxes: {}".format(self.nboxes)) + self.scale_xy = dboxes.scale_xy + self.scale_wh = dboxes.scale_wh + + def encode(self, bboxes_in, labels_in, criteria = 0.5): + + ious = calc_iou_tensor(bboxes_in, self.dboxes) + best_dbox_ious, best_dbox_idx = ious.max(dim=0) + best_bbox_ious, best_bbox_idx = ious.max(dim=1) + + # set best ious 2.0 + best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0) + + idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64) + best_dbox_idx[best_bbox_idx[idx]] = idx + + # filter IoU > 0.5 + masks = best_dbox_ious > criteria + labels_out = torch.zeros(self.nboxes, dtype=torch.long) + #print(maxloc.shape, labels_in.shape, labels_out.shape) + labels_out[masks] = labels_in[best_dbox_idx[masks]] + bboxes_out = self.dboxes.clone() + bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :] + # Transform format to xywh format + x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \ + 0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \ + -bboxes_out[:, 0] + bboxes_out[:, 2], \ + -bboxes_out[:, 1] + bboxes_out[:, 3] + bboxes_out[:, 0] = x + bboxes_out[:, 1] = y + bboxes_out[:, 2] = w + bboxes_out[:, 3] = h + return bboxes_out, labels_out + + def scale_back_batch(self, bboxes_in, scores_in): + """ + Do scale and transform from xywh to ltrb + suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox + """ + if bboxes_in.device == torch.device("cpu"): + self.dboxes = self.dboxes.cpu() + self.dboxes_xywh = self.dboxes_xywh.cpu() + else: + self.dboxes = self.dboxes.cuda() + self.dboxes_xywh = self.dboxes_xywh.cuda() + + bboxes_in = bboxes_in.permute(0, 2, 1) + scores_in = scores_in.permute(0, 2, 1) + 
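+        # undo the training-time encoding: offsets are rescaled by scale_xy/scale_wh, (cx, cy) is
+        # multiplied by the default-box sizes and shifted by their centers, (w, h) is exp()'d and
+        # scaled by the default-box sizes, and finally boxes are converted from xywh to ltrb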
#print(bboxes_in.device, scores_in.device, self.dboxes_xywh.device) + + bboxes_in[:, :, :2] = self.scale_xy*bboxes_in[:, :, :2] + bboxes_in[:, :, 2:] = self.scale_wh*bboxes_in[:, :, 2:] + + bboxes_in[:, :, :2] = bboxes_in[:, :, :2]*self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2] + bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*self.dboxes_xywh[:, :, 2:] + + # Transform format to ltrb + l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\ + bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\ + bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\ + bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3] + + bboxes_in[:, :, 0] = l + bboxes_in[:, :, 1] = t + bboxes_in[:, :, 2] = r + bboxes_in[:, :, 3] = b + + return bboxes_in, F.softmax(scores_in, dim=-1) + + def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200): + bboxes, probs = self.scale_back_batch(bboxes_in, scores_in) + + output = [] + for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): + bbox = bbox.squeeze(0) + prob = prob.squeeze(0) + output.append(self.decode_single(bbox, prob, criteria, max_output)) + #print(output[-1]) + return output + + # perform non-maximum suppression + def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200): + # Reference to https://github.com/amdegroot/ssd.pytorch + + bboxes_out = [] + scores_out = [] + labels_out = [] + + for i, score in enumerate(scores_in.split(1, 1)): + # skip background + # print(score[score>0.90]) + if i == 0: continue + # print(i) + + score = score.squeeze(1) + mask = score > 0.05 + + bboxes, score = bboxes_in[mask, :], score[mask] + if score.size(0) == 0: continue + + score_sorted, score_idx_sorted = score.sort(dim=0) + + # select max_output indices + score_idx_sorted = score_idx_sorted[-max_num:] + candidates = [] + #maxdata, maxloc = scores_in.sort() + + while score_idx_sorted.numel() > 0: + idx = score_idx_sorted[-1].item() + bboxes_sorted = bboxes[score_idx_sorted, :] + bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) + iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze() + # we only need iou < criteria + score_idx_sorted = score_idx_sorted[iou_sorted < criteria] + candidates.append(idx) + + bboxes_out.append(bboxes[candidates, :]) + scores_out.append(score[candidates]) + labels_out.extend([i]*len(candidates)) + + bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \ + torch.tensor(labels_out, dtype=torch.long), \ + torch.cat(scores_out, dim=0) + + + _, max_ids = scores_out.sort(dim=0) + max_ids = max_ids[-max_output:] + return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] + + +class DefaultBoxes(object): + def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \ + scale_xy=0.1, scale_wh=0.2): + + self.feat_size = feat_size + self.fig_size = fig_size + + self.scale_xy_ = scale_xy + self.scale_wh_ = scale_wh + + # According to https://github.com/weiliu89/caffe + # Calculation method slightly different from paper + self.steps = steps + self.scales = scales + + fk = fig_size/np.array(steps) + self.aspect_ratios = aspect_ratios + + self.default_boxes = [] + # size of feature and number of feature + for idx, sfeat in enumerate(self.feat_size): + + sk1 = scales[idx]/fig_size + sk2 = scales[idx+1]/fig_size + sk3 = sqrt(sk1*sk2) + all_sizes = [(sk1, sk1), (sk3, sk3)] + + for alpha in aspect_ratios[idx]: + w, h = sk1*sqrt(alpha), sk1/sqrt(alpha) + all_sizes.append((w, h)) + all_sizes.append((h, w)) + for w, h in all_sizes: + for i, j in itertools.product(range(sfeat), repeat=2): + cx, 
cy = (j+0.5)/fk[idx], (i+0.5)/fk[idx] + self.default_boxes.append((cx, cy, w, h)) + + self.dboxes = torch.tensor(self.default_boxes) + self.dboxes.clamp_(min=0, max=1) + # For IoU calculation + self.dboxes_ltrb = self.dboxes.clone() + self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5*self.dboxes[:, 2] + self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5*self.dboxes[:, 3] + self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5*self.dboxes[:, 2] + self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5*self.dboxes[:, 3] + + @property + def scale_xy(self): + return self.scale_xy_ + + @property + def scale_wh(self): + return self.scale_wh_ + + def __call__(self, order="ltrb"): + if order == "ltrb": return self.dboxes_ltrb + if order == "xywh": return self.dboxes + + +# This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation. +class SSDCropping(object): + """ Cropping for SSD, according to original paper + Choose between following 3 conditions: + 1. Preserve the original image + 2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9 + 3. Random crop + Reference to https://github.com/chauhan-utk/ssd.DomainAdaptation + """ + def __init__(self): + + self.sample_options = ( + # Do nothing + None, + # min IoU, max IoU + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + # no IoU requirements + (None, None), + ) + + def __call__(self, img, img_size, bboxes, labels): + + # Ensure always return cropped image + while True: + mode = random.choice(self.sample_options) + + if mode is None: + return img, img_size, bboxes, labels + + htot, wtot = img_size + + min_iou, max_iou = mode + min_iou = float("-inf") if min_iou is None else min_iou + max_iou = float("+inf") if max_iou is None else max_iou + + # Implementation use 50 iteration to find possible candidate + for _ in range(50): + # suze of each sampled path in [0.1, 1] 0.3*0.3 approx. 
0.1 + w = random.uniform(0.3 , 1.0) + h = random.uniform(0.3 , 1.0) + + if w/h < 0.5 or w/h > 2: + continue + + # left 0 ~ wtot - w, top 0 ~ htot - h + left = random.uniform(0, 1.0 - w) + top = random.uniform(0, 1.0 - h) + + right = left + w + bottom = top + h + + ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]])) + + # tailor all the bboxes and return + if not ((ious > min_iou) & (ious < max_iou)).all(): + continue + + # discard any bboxes whose center not in the cropped image + xc = 0.5*(bboxes[:, 0] + bboxes[:, 2]) + yc = 0.5*(bboxes[:, 1] + bboxes[:, 3]) + + masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom) + + # if no such boxes, continue searching again + if not masks.any(): + continue + + bboxes[bboxes[:, 0] < left, 0] = left + bboxes[bboxes[:, 1] < top, 1] = top + bboxes[bboxes[:, 2] > right, 2] = right + bboxes[bboxes[:, 3] > bottom, 3] = bottom + + #print(left, top, right, bottom) + #print(labels, bboxes, masks) + bboxes = bboxes[masks, :] + labels = labels[masks] + + left_idx = int(left*wtot) + top_idx = int(top*htot) + right_idx = int(right*wtot) + bottom_idx = int(bottom*htot) + #print(left_idx,top_idx,right_idx,bottom_idx) + #img = img[:, top_idx:bottom_idx, left_idx:right_idx] + img = img.crop((left_idx, top_idx, right_idx, bottom_idx)) + + bboxes[:, 0] = (bboxes[:, 0] - left)/w + bboxes[:, 1] = (bboxes[:, 1] - top)/h + bboxes[:, 2] = (bboxes[:, 2] - left)/w + bboxes[:, 3] = (bboxes[:, 3] - top)/h + + htot = bottom_idx - top_idx + wtot = right_idx - left_idx + return img, (htot, wtot), bboxes, labels + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = torch.Tensor(np.array(img)) + # Transform from HWC to CHW + img = img.permute(2, 0 ,1) + return img + +class LightingNoice(object): + """ + See this question, AlexNet data augumentation: + https://stackoverflow.com/questions/43328600 + """ + def __init__(self): + self.eigval = torch.tensor([55.46, 4.794, 1.148]) + self.eigvec = torch.tensor([ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203]]) + + def __call__(self, img): + img = torch.Tensor(np.array(img)) + # Transform from HWC to CHW + img = img.permute(2, 0 ,1) + return img + alpha0 = random.gauss(sigma=0.1, mu=0) + alpha1 = random.gauss(sigma=0.1, mu=0) + alpha2 = random.gauss(sigma=0.1, mu=0) + + channels = alpha0*self.eigval[0]*self.eigvec[0, :] + \ + alpha1*self.eigval[1]*self.eigvec[1, :] + \ + alpha2*self.eigval[2]*self.eigvec[2, :] + channels = channels.view(3, 1, 1) + img += channels + + return img + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image, bboxes): + if random.random() < self.p: + bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0] + return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes + return image, bboxes + +# Do data augumentation +class SSDTransformer(object): + """ SSD Data Augumentation, according to original paper + Composed by several steps: + Cropping + Resize + Flipping + Jittering + """ + def __init__(self, dboxes, size = (300, 300), val=False): + + # define vgg16 mean + self.size = size + self.val = val + + self.dboxes_ = dboxes #DefaultBoxes300() + self.encoder = Encoder(self.dboxes_) + + self.crop = SSDCropping() + self.img_trans = transforms.Compose([ + transforms.Resize(self.size), + #transforms.Resize((300, 300)), + #transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0.125, contrast=0.5, + saturation=0.5, hue=0.05 + ), + 
+            transforms.ToTensor()
+            #LightingNoice(),
+        ])
+        self.hflip = RandomHorizontalFlip()
+
+        # All PyTorch tensors are normalized with the ImageNet statistics
+        # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
+        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                              std=[0.229, 0.224, 0.225])
+        #self.normalize = transforms.Normalize(mean = [104.0, 117.0, 123.0],
+        #                                      std = [1.0, 1.0, 1.0])
+
+        self.trans_val = transforms.Compose([
+            transforms.Resize(self.size),
+            transforms.ToTensor(),
+            #ToTensor(),
+            self.normalize,])
+
+    @property
+    def dboxes(self):
+        return self.dboxes_
+
+    def __call__(self, img, img_size, bbox=None, label=None, max_num=200):
+        #img = torch.tensor(img)
+        if self.val:
+            # Validation: pad ground truth to a fixed size (max_num) so batches can be collated
+            bbox_out = torch.zeros(max_num, 4)
+            label_out = torch.zeros(max_num, dtype=torch.long)
+            bbox_out[:bbox.size(0), :] = bbox
+            label_out[:label.size(0)] = label
+            return self.trans_val(img), img_size, bbox_out, label_out
+
+        #print("before", img.size, bbox)
+        img, img_size, bbox, label = self.crop(img, img_size, bbox, label)
+        #print("after", img.size, bbox)
+        img, bbox = self.hflip(img, bbox)
+
+        img = self.img_trans(img).contiguous()
+        #img = img.contiguous().div(255)
+        img = self.normalize(img)
+
+        bbox, label = self.encoder.encode(bbox, label)
+
+        return img, img_size, bbox, label
+
+# Implement a datareader for the COCO dataset
+class COCODetection(data.Dataset):
+    def __init__(self, img_folder, annotate_file, transform=None):
+        self.img_folder = img_folder
+        self.annotate_file = annotate_file
+
+        # Start processing annotations
+        with open(annotate_file) as fin:
+            self.data = json.load(fin)
+
+        self.images = {}
+
+        self.label_map = {}
+        self.label_info = {}
+        #print("Parsing COCO data...")
+        start_time = time.time()
+        # 0 stands for the background
+        cnt = 0
+        self.label_info[cnt] = "background"
+        for cat in self.data["categories"]:
+            cnt += 1
+            self.label_map[cat["id"]] = cnt
+            self.label_info[cnt] = cat["name"]
+
+        # build an index of images: id -> (file name, size, boxes)
+        for img in self.data["images"]:
+            img_id = img["id"]
+            img_name = img["file_name"]
+            img_size = (img["height"], img["width"])
+            #print(img_name)
+            if img_id in self.images: raise Exception("duplicated image record")
+            self.images[img_id] = (img_name, img_size, [])
+
+        # read bboxes
+        for bboxes in self.data["annotations"]:
+            img_id = bboxes["image_id"]
+            category_id = bboxes["category_id"]
+            bbox = bboxes["bbox"]
+            bbox_label = self.label_map[category_id]
+            self.images[img_id][2].append((bbox, bbox_label))
+
+        # drop images that have no annotated boxes
+        for k, v in list(self.images.items()):
+            if len(v[2]) == 0:
+                #print("empty image: {}".format(k))
+                self.images.pop(k)
+
+        self.img_keys = list(self.images.keys())
+        self.transform = transform
+        #print("End parsing COCO data, total time {}".format(time.time()-start_time))
+
+    @property
+    def labelnum(self):
+        return len(self.label_info)
+
+    @staticmethod
+    def load(pklfile):
+        #print("Loading from {}".format(pklfile))
+        with bz2.open(pklfile, "rb") as fin:
+            ret = pickle.load(fin)
+        return ret
+
+    def save(self, pklfile):
+        #print("Saving to {}".format(pklfile))
+        with bz2.open(pklfile, "wb") as fout:
+            pickle.dump(self, fout)
+
+    def __len__(self):
+        return len(self.images)
+
+    def __getitem__(self, idx):
+        img_id = self.img_keys[idx]
+        img_data = self.images[img_id]
+        fn = img_data[0]
+        img_path = os.path.join(self.img_folder, fn)
+        img = Image.open(img_path).convert("RGB")
+
+        htot, wtot = img_data[1]
+        bbox_sizes = []
+        bbox_labels = []
+
+        # COCO bboxes are (left, top, width, height); convert to fractional ltrb
+        #for (xc, yc, w, h), bbox_label in img_data[2]:
+        for (l, t, w, h), bbox_label in img_data[2]:
+            r = l + w
+            b = t + h
+            #l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h
+            bbox_size = (l/wtot, t/htot, r/wtot, b/htot)
+            bbox_sizes.append(bbox_size)
+            bbox_labels.append(bbox_label)
+
+        bbox_sizes = torch.tensor(bbox_sizes)
+        bbox_labels = torch.tensor(bbox_labels)
+
+        if self.transform is not None:
+            img, (htot, wtot), bbox_sizes, bbox_labels = \
+                self.transform(img, (htot, wtot), bbox_sizes, bbox_labels)
+
+        return img, (htot, wtot), bbox_sizes, bbox_labels
+
+# Implement a datareader for the VOC dataset
+class VOCDetection(data.Dataset):
+    """ VOC PASCAL 07/12 DataReader
+        params:
+            img_folder: image folder
+            annotate_folder: annotation folder (xml)
+    """
+    def __init__(self, img_folder, annotate_folder, file_filter, transform=None, label_map = {}, difficult=True):
+        #print("Reading data information")
+
+        self.img_folder = img_folder
+        self.annotate_folder = annotate_folder
+        self.transform = transform
+        self.difficult = difficult
+        self.file_filter = file_filter
+
+        # Read the file filter (list of image basenames to keep)
+        with open(file_filter, "r") as fin:
+            self.filter = fin.read().strip().split("\n")
+
+        self.images = []
+        self.label_num = 0
+        self.label_map = {v: k for k, v in label_map.items()}
+
+        for xml_file in glob.glob(os.path.join(annotate_folder, "*.xml")):
+            ret = self._parse_xml(xml_file)
+            if ret:
+                self.images.append(ret)
+
+        self.label_map = {v: k for k, v in self.label_map.items()}
+        # Add background label
+        self.label_map[0] = "background"
+        self.label_num += 1
+        #print("Finished Reading")
+
+    def _parse_xml(self, xml_file):
+        #print(xml_file)
+        root = ElementTree.ElementTree(file=xml_file)
+        img_name = root.find("filename").text
+        # Get basename
+        base_name = Path(img_name).resolve().stem
+        if base_name not in self.filter:
+            return []
+
+        img_size = (
+            int(root.find("size").find("height").text),
+            int(root.find("size").find("width").text),
+            int(root.find("size").find("depth").text), )
+
+        tmp_data = []
+        for obj in root.findall("object"):
+            # extract xmin, ymin, xmax, ymax
+            difficult = obj.find("difficult").text
+            if difficult == "1" and not self.difficult:
+                continue
+            bbox = (
+                int(obj.find("bndbox").find("xmin").text),
+                int(obj.find("bndbox").find("ymin").text),
+                int(obj.find("bndbox").find("xmax").text),
+                int(obj.find("bndbox").find("ymax").text), )
+            bbox_label = obj.find("name").text
+            if bbox_label in self.label_map:
+                bbox_label = self.label_map[bbox_label]
+            else:
+                self.label_num += 1
+                self.label_map[bbox_label] = self.label_num
+                bbox_label = self.label_num
+            tmp_data.append((bbox, bbox_label))
+
+        return (img_name, img_size, tmp_data)
+
+    def __getitem__(self, idx):
+
+        image_info = self.images[idx]
+        #print(self.images)
+        #print(image_info)
+        img_path = os.path.join(self.img_folder, image_info[0])
+        #img = np.array(Image.open(img_path).convert('RGB'))
+        img = Image.open(img_path)
+
+        # Assert that the record in the xml and the image match
+        # assert img.size == image_info[1], "Image Size Does Not Match!"
+
+        htot, wtot, _ = image_info[1]
+
+        bbox_sizes = []
+        bbox_labels = []
+
+        for (xmin, ymin, xmax, ymax), bbox_label in image_info[2]:
+            #cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin
+            #bbox_size = (cx, cy, w, h)
+            #print(cx, cy, w, h)
+            #bbox_size = (cx/wtot, cy/htot, w/wtot, h/htot)
+            l, t, r, b = xmin, ymin, xmax, ymax
+            bbox_size = (l/wtot, t/htot, r/wtot, b/htot)
+            bbox_sizes.append(bbox_size)
+            #bbox_labels.append(self.label_map[bbox_label])
+            bbox_labels.append(bbox_label)
+
+        bbox_sizes = torch.tensor(bbox_sizes)
+        bbox_labels = torch.tensor(bbox_labels)
+        #bbox_size = (xmin, ymin, xmax, ymax)
+        #bbox_label = bbox_info[3]
+
+        # Perform image transformation
+        if self.transform is not None:
+            img, (htot, wtot), bbox_sizes, bbox_labels = \
+                self.transform(img, (htot, wtot), bbox_sizes, bbox_labels)
+
+        #print(img.shape, bbox_sizes.shape, bbox_labels.shape)
+        #print(idx, "non_bg:", (bbox_labels > 0).sum().item())
+        #print(img.shape)
+        return img, (htot, wtot), bbox_sizes, bbox_labels
+
+    def __len__(self):
+        return len(self.images)
+
+
+def draw_patches(img, bboxes, labels, order="xywh", label_map={}):
+
+    import matplotlib.pyplot as plt
+    import matplotlib.patches as patches
+    # Assume bboxes are in fractional coordinates:
+    #   (cx, cy, w, h) by default, or (l, t, r, b) when order="ltrb"
+    # img = img.numpy()
+    img = np.array(img)
+    labels = np.array(labels)
+    bboxes = bboxes.numpy()
+
+    if label_map:
+        labels = [label_map.get(l) for l in labels]
+
+    if order == "ltrb":
+        xmin, ymin, xmax, ymax = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
+        cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin
+    else:
+        cx, cy, w, h = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
+
+    htot, wtot, _ = img.shape
+    cx *= wtot
+    cy *= htot
+    w *= wtot
+    h *= htot
+
+    bboxes = zip(cx, cy, w, h)
+
+    plt.imshow(img)
+    ax = plt.gca()
+    for (cx, cy, w, h), label in zip(bboxes, labels):
+        if label == "background": continue
+        ax.add_patch(patches.Rectangle((cx-0.5*w, cy-0.5*h),
+                                       w, h, fill=False, color="r"))
+        bbox_props = dict(boxstyle="round", fc="y", ec="0.5", alpha=0.3)
+        ax.text(cx-0.5*w, cy-0.5*h, label, ha="center", va="center", size=15, bbox=bbox_props)
+    plt.show()
+
+
+if __name__ == "__main__":
+
+    #trans = SSDTransformer()
+    #vd = VOCDetection("../../VOCdevkit/VOC2007/JPEGImages",
+    #                  "../../VOCdevkit/VOC2007/Annotations",
+    #                  "../../VOCdevkit/VOC2007/ImageSets/Main/trainval.txt",
+    #                  transform = trans)
+
+    #imgs, img_size, bbox, label = vd[0]
+    #img = imgs[:, :, :]
+    #img *= torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
+    #img += torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
+    #img = img.permute(1, 2, 0)
+    #print(bbox[label>0], label[label>0])
+    #draw_patches(img, bbox[label>0], label[label>0], order="xywh", label_map=vd.label_map)
+
+    annotate = "../../coco_ssd/instances_valminusminival2014.json"
+    coco_root = "../../coco_data/val2014"
+
+    coco = COCODetection(coco_root, annotate)
+    #coco.save("save.pb2")
+    print(len(coco))
+    #img, img_size, bbox, label = coco[2]
+    #draw_patches(img, bbox, label, order="ltrb", label_map=coco.label_info)
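For orientation, here is a minimal usage sketch of the data pipeline added above; it is not part of the reference code. It assumes the COCO 2017 layout produced by `download_dataset.sh` (`/coco/train2017`, `/coco/annotations/instances_train2017.json`), that this module is importable as `utils`, and that `DefaultBoxes`, `SSDTransformer`, and `COCODetection` keep the constructor signatures shown in this file. The SSD300 feature-map sizes, steps, scales, and aspect ratios below are the usual values and are illustrative only.

```
from torch.utils.data import DataLoader
# Assumption: this module is importable as `utils`
from utils import DefaultBoxes, SSDTransformer, COCODetection

# Default (prior) boxes for SSD300: 8732 boxes over the 38/19/10/5/3/1 feature maps
# (parameter values are the usual SSD300 settings, shown here for illustration)
dboxes = DefaultBoxes(fig_size=300,
                      feat_size=[38, 19, 10, 5, 3, 1],
                      steps=[8, 16, 32, 64, 100, 300],
                      scales=[21, 45, 99, 153, 207, 261, 315],
                      aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])

# Training-mode transform: random crop + flip + color jitter, then encode boxes against dboxes
train_trans = SSDTransformer(dboxes, size=(300, 300), val=False)
train_data = COCODetection("/coco/train2017",
                           "/coco/annotations/instances_train2017.json",
                           transform=train_trans)

loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
imgs, (htot, wtot), gloc, glabel = next(iter(loader))
# imgs: (32, 3, 300, 300); gloc/glabel: per-default-box targets produced by Encoder.encode
```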